libabigail
Loading...
Searching...
No Matches
abg-diff-utils.h
Go to the documentation of this file.
1// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
2// -*- Mode: C++ -*-
3//
4// Copyright (C) 2013-2024 Red Hat, Inc.
5
6/// @file
7///
8/// This file declares types and operations implementing the "O(ND)
9/// Difference Algorithm" (aka diff2) from Eugene W. Myers, to compute
10/// the difference between two sequences.
11///
12/// To understand what is going on here, one must read the paper at
13/// http://www.xmailserver.org/diff2.pdf. Throughout this file, that
14/// paper is referred to as "the paper".
15///
16/// The implementations goes as far as calculating the shortest edit
17/// script (the set of insertions and deletions) for transforming a
18/// sequence into another. The main entry point for that is the
19/// compute_diff() function.
20
21#ifndef __ABG_DIFF_UTILS_H__
22#define __ABG_DIFF_UTILS_H__
23
24#include <cassert>
25#include <cstdlib>
26#include <memory>
27#include <ostream>
28#include <sstream>
29#include <stdexcept>
30#include <string>
31#include <vector>
32#include "abg-fwd.h"
33
34namespace abigail
35{
36
37/// @brief Libabigail's core diffing algorithms
38///
39/// This is the namespace defining the core diffing algorithm used by
40/// libabigail @ref comparison tools. This based on the diff
41/// algorithm from Eugene Myers.
42namespace diff_utils
43{
44
45using std::shared_ptr;
46
47// Inject the names from std:: below into this namespace
48using std::string;
49using std::ostream;
50using std::vector;
51using std::abs;
52using std::ostringstream;
53
54/// A class representing a vertex in an edit graph, as explained in
55/// the paper. A vertex is a basically a pair of coordinates
56/// (abscissa and ordinate).
57class point
58{
59 int x_;
60 int y_;
61 bool empty_;
62
63public:
64
65 point()
66 : x_(-1), y_(-1),empty_(true)
67 {}
68
69 point(int x, int y)
70 : x_(x), y_(y), empty_(false)
71 {}
72
73 point(const point& p)
74 : x_(p.x()), y_(p.y()), empty_(p.is_empty())
75 {}
76
77 int
78 x() const
79 {return x_;}
80
81 void
82 x(int x)
83 {
84 x_ = x;
85 empty_ = false;
86 }
87
88 int
89 y() const
90 {return y_;}
91
92 void
93 y(int y)
94 {
95 y_ = y;
96 empty_ = false;
97 }
98
99 void
100 set(int x, int y)
101 {
102 x_ = x;
103 y_ = y;
104 empty_ = false;
105 }
106
107 void
108 set(int x, int y, bool empty)
109 {
110 x_ = x;
111 y_ = y;
112 empty_ = empty;
113 }
114
115 void
116 add(int ax, int ay)
117 {set (x() + ax, y() + ay);}
118
119 bool
120 operator!=(const point& o) const
121 {return (x() != o.x() || y() != o.y() || is_empty() != o.is_empty());}
122
123 bool
124 operator==(const point& o) const
125 {return !(operator!=(o));}
126
127 bool
128 operator<(const point& o) const
129 {return (x() < o.x() && y() < o.y());}
130
131 bool
132 operator>(const point& o) const
133 {return (x() > o.x() && y() > o.y());}
134
135 bool
136 operator<=(const point& o) const
137 {return (x() <= o.x() && y() <= o.y());}
138
139 bool
140 operator>=(const point& o) const
141 {return (x() >= o.x() && y() >= o.y());}
142
143 point
144 operator+(int val) const
145 {return point(x() + val, y() + val);}
146
147 point
148 operator-(int val) const
149 {return point(x() - val, y() - val);}
150
151 point&
152 operator+= (int val)
153 {
154 set(x_ + val, y_ + val);
155 return *this;
156 }
157
158 point&
159 operator-= (int val)
160 {return (*this) += (-val);}
161
162 point&
163 operator--()
164 {return (*this) -= 1;}
165
166 point&
167 operator++()
168 {return (*this) += 1;}
169
170 point
171 operator--(int)
172 {
173 point tmp(*this);
174 --(*this);
175 return tmp;
176 }
177
178 point
179 operator++(int)
180 {
181 point tmp(*this);
182 ++(*this);
183 return tmp;
184 }
185
186 point&
187 operator=(int val)
188 {
189 set(val, val);
190 return *this;
191 }
192
193 point&
194 operator=(const point& p)
195 {
196 set(p.x(), p.y(), p.is_empty());
197 return *this;
198 }
199
200 bool
201 is_empty() const
202 {return empty_;}
203
204 operator bool () const
205 {return !is_empty();}
206
207 bool
208 operator!() const
209 {return is_empty();}
210
211 void
212 clear()
213 {
214 x_ = -1;
215 y_ = -1;
216 empty_ = true;
217 }
218};// end point
219
220/// The abstraction of the Snake concept, from the paper.
221///
222/// In a given path from the edit graph, a snake is a non-diagonal
223/// edge followed by zero or more diagonal edges.
224///
225/// The starting poing of the non-diagonal edge is the beginning of
226/// the snake. This is given by the snake::begin() method. This point
227/// is not explicitely referenced in the paper, but we need it for
228/// some grunt implementation details of the algorithm.
229///
230/// The end point of the non-diagonal edge is the intermediate point
231/// of the snake; it's given by the snake::intermediate() method.
232/// This point is what is referred to as "the begining of the snake"
233/// in the paper.
234///
235/// The end point of the first diagonal edge is given by the
236/// snake::diagonal_start() method.
237///
238/// The end point of the last diagonal edge is given by the
239/// snake::end() method. Note that when the snake contains no
240/// diagonal edge, snake::intermediate(), and snake::end() return the
241/// same point; snake::diagonal_start() contains an empty point (i.e,
242/// a point for which point::is_empty() returns true).
243class snake
244{
245 point begin_, intermediate_, diagonal_start_, end_;
246 bool forward_;
247
248public:
249
250 /// Default constructor for snake.
252 : forward_(false)
253 {}
254
255 /// Constructor from the beginning, intermediate and end points.
256 ///
257 /// @param b the beginning point of the snake. That is, the
258 /// starting point of the non-diagonal edge.
259 ///
260 /// @param i the intermediate point of the snake. That is, the end
261 /// point of the non-diagonal edge.
262 ///
263 /// @param e the end point of the snake. That is the end point of
264 /// the last diagonal edge.
265 snake(const point& b,
266 const point& i,
267 const point& e)
268 : begin_(b), intermediate_(i),
269 end_(e), forward_(false)
270 {}
271
272 /// Constructor from the beginning, intermediate and end points.
273 ///
274 /// @param b the beginning point of the snake. That is, the
275 /// starting point of the non-diagonal edge.
276 ///
277 /// @param i the intermediate point of the snake. That is, the end
278 /// point of the non-diagonal edge.
279 ///
280 /// @param d the beginning of the diagonal edge. That is the end of
281 /// the first diagonal edge of the snake.
282 ///
283 /// @param e the end point of the snake. That is the end point of
284 /// the last diagonal edge.
285 snake(const point& b,
286 const point& i,
287 const point& d,
288 const point& e)
289 : begin_(b), intermediate_(i),
290 diagonal_start_(d), end_(e),
291 forward_(false)
292 {}
293
294 /// Getter for the starting point of the non-diagonal edge of the
295 /// snake.
296 ///
297 /// @return the starting point of the non-diagonal edge of the snake
298 const point&
299 begin() const
300 {return begin_;}
301
302 /// Getter for the starting point of the non-diagonal edge of the
303 /// snake, aka begin point.
304 ///
305 ///@param p the new begin point.
306 void
307 begin(const point& p)
308 {begin_ = p;}
309
310 /// Getter for the end point of the non-diagonal edge of the snake.
311 ///
312 /// @return the end point of the non-diagonal edge of the snake
313 const point&
315 {return intermediate_;}
316
317 /// Setter for the end point of the non-diagonal edge of the snake,
318 /// aka intermediate point.
319 ///
320 /// @param p the new intermediate point.
321 void
323 {intermediate_ = p;}
324
325 /// Getter for the end point of the first diagonal edge, aka
326 /// diagonal start point. Note that if the snake has no diagonal
327 /// edge, this point is empty.
328 ///
329 /// @return the end point of the first diagonal edge.
330 const point&
332 {return diagonal_start_;}
333
334 /// Setter for the end point of the first diagonal edge, aka
335 /// diagonal start point.
336 ///
337 /// @param p the new diagonal start.d
338 void
340 {diagonal_start_ = p;}
341
342 /// Getter for the end point of the last diagonal edge, aka snake
343 /// end point. Note that if the snake has no diagonal edge, this
344 /// point is equal to the intermediate point.
345 ///
346 /// @return the end point of the last diagonal edge
347 const point&
348 end() const
349 {return end_;}
350
351 /// Setter for the end point of the last diagonal edge, aka snake
352 /// end point. Note that if the snake has no diagonal edge, this
353 /// point is equal to the intermediate point.
354 void
355 end(const point& p)
356 {end_ = p;}
357
358 /// Setter for the begin, intermediate and end points of the snake.
359 ///
360 /// @param b the new snake begin point
361 ///
362 /// @param i the new snake intermediate point
363 ///
364 /// @param e the new snake end point
365 void
366 set(const point& b, const point&i, const point&e)
367 {
368 begin(b);
369 intermediate(i);
370 end(e);
371 }
372
373 /// Setter for the begin, intermediate, diagonal start and end points
374 /// of the snake.
375 ///
376 /// @param b the new snake begin point
377 ///
378 /// @param i the new snake intermediate point
379 ///
380 /// @param d the new diagonal start point
381 ///
382 /// @param e the new snake end point
383 void
384 set(const point& b, const point&i, const point& d, const point&e)
385 {
386 begin(b);
387 intermediate(i);
389 end(e);
390 }
391
392 /// @return true iff the snake is a forward snake. That is, if it
393 /// was built while walking the edit graph going forward (from the
394 /// top left corner to the right bottom corner.
395 bool
397 {return forward_;}
398
399 /// Set to true if the snake is a forward snake; that is, if it was
400 /// built while walking the edit graph going forward (from the top
401 /// left corner to the right bottom corner. Set to false otherwise.
402 ///
403 /// @param f whether the snake is a forward snake or not.
404 void
406 {forward_ = f;}
407
408 /// Add an offset to the abscissas of the points of the snake, and
409 /// add another offset to the ordinates of these same points.
410 ///
411 /// @param x_offset the offset to add to the abscissas of all the
412 /// points of the snake.
413 ///
414 /// @param y_offset the offset to add to the ordinates of all the
415 /// points of the snake.
416 void
417 add(int x_offset, int y_offset)
418 {
419 if (is_empty())
420 return;
421
422 begin_.add(x_offset, y_offset);
423 intermediate_.add(x_offset, y_offset);
424 if (diagonal_start_)
425 diagonal_start_.add(x_offset, y_offset);
426 end_.add(x_offset, y_offset);
427 }
428
429 /// @return true iff the snake has at least one diagonal edge.
430 bool
432 {return !diagonal_start().is_empty();}
433
434 /// @return true iff the non-diagonal edge is horizontal.
435 bool
437 {return (begin().y() == intermediate().y());}
438
439 /// @return true iff the non-diagonal edge is vertical.
440 bool
442 {return (begin().x() == intermediate().x());}
443
444 /// @return true iff the snake is empty, that is, if all the points
445 /// it contains are empty.
446 bool is_empty() const
447 {return begin().is_empty() && intermediate().is_empty() && end().is_empty();}
448};// end class snake
449
450/// The array containing the furthest D-path end-points, for each value
451/// of K. MAX_D is the maximum value of the D-Path. That is, M+N if
452/// M is the size of the first input string, and N is the size of the
453/// second.
454class d_path_vec : public std::vector<int>
455{
456private:
457
458 unsigned a_size_;
459 unsigned b_size_;
460
461 /// Forbid vector size modifications
462 void
463 push_back(const vector<int>::value_type&);
464
465 /// Forbid default constructor.
466 d_path_vec();
467
468 bool
469 over_bounds(long long index) const
470 {return (index + offset()) >= (long long) size();}
471
472 void
473 check_index_against_bound(int index) const
474 {
475 if (over_bounds(index))
476 {
477 ostringstream o;
478 o << "index '" << index
479 << "' out of range [-" << max_d() << ", " << max_d() << "]";
480 throw std::out_of_range(o.str());
481 }
482 }
483
484public:
485
486 /// Constructor of the d_path_vec.
487 ///
488 /// For forward vectors, the underlying vector allocates 2 *
489 /// [MAX_D+1].
490 /// space, so that one can address elements in the index range
491 /// [-MAX_D, MAX_D]. And MAX_D is the sum of the two sequence
492 /// sizes. delta is the difference.
493 ///
494 /// For reverse vectors, note that we need to be able to address
495 /// [-MAX_D - delta, MAX_D + delta], with delta being the (signed)
496 /// difference between the size of the two sequences. We consider
497 /// delta being bounded by MAX_D itself; so we say we need to be
498 /// able to address [-2MAX_D, 2MAX_D].
499 ///
500 /// @param size1 the size of the first sequence we are interested
501 /// in.
502 ///
503 /// @param size2 the size of the second sequence we are interested
504 /// in.
505 d_path_vec(unsigned size1, unsigned size2)
506 : vector<int>(2 * (size1 + size2 + 1 + (size1 + size2)) + 1, 0),
507 a_size_(size1), b_size_(size2)
508 {
509 }
510
511 std::vector<int>::const_reference
512 operator[](int index) const
513 {return at(index);}
514
515 std::vector<int>::reference
516 operator[](int index)
517 {return at(index);}
518
519 std::vector<int>::reference
520 at(long long index)
521 {
522 //check_index_against_bound(index);
523 long long i = index + offset();
524 return vector<int>::operator[](i);
525 }
526
527 std::vector<int>::const_reference
528 at(long long index) const
529 {
530 check_index_against_bound(index);
531 long long i = offset() + index;
532 return static_cast<const vector<int>* >(this)->at(i);
533 }
534
535 unsigned
536 a_size() const
537 {return a_size_;}
538
539 unsigned
540 b_size() const
541 {return b_size_;}
542
543 unsigned
544 max_d() const
545 {return a_size_ + b_size_;}
546
547 unsigned
548 offset() const
549 {return max_d() + abs((long long) a_size() - (long long) b_size());}
550}; // end class d_path_vec
551
552/// The abstration of an insertion of elements of a sequence B into a
553/// sequence A. This is used to represent the edit script for
554/// transforming a sequence A into a sequence B.
555///
556/// And insertion mainly encapsulates two components:
557///
558/// - An insertion point: this is the index (starting at 0) of the
559/// element of the sequence A after which the insertion occurs.
560///
561/// - Inserted elements: this is a vector of indexes of elements of
562/// sequence B (starting at 0) that got inserted into sequence A,
563/// after the insertion point.
565{
566 int insertion_point_;
567 vector<unsigned> inserted_;
568
569public:
570
571 insertion(int insertion_point,
572 const vector<unsigned>& inserted_indexes)
573 : insertion_point_(insertion_point),
574 inserted_(inserted_indexes)
575 {}
576
577 insertion(int insertion_point = 0)
578 : insertion_point_(insertion_point)
579 {}
580
581 int
582 insertion_point_index() const
583 {return insertion_point_;}
584
585 void
586 insertion_point_index(int i)
587 {insertion_point_ = i;}
588
589 const vector<unsigned>&
590 inserted_indexes() const
591 {return inserted_;}
592
593 vector<unsigned>&
594 inserted_indexes()
595 {return inserted_;}
596};// end class insertion
597
598/// The abstraction of the deletion of one element of a sequence A.
599///
600/// This encapsulates the index of the element A that got deleted.
602{
603 int index_;
604
605public:
606
607 deletion(int i)
608 : index_(i)
609 {}
610
611 int
612 index() const
613 {return index_;}
614
615 void
616 index(int i)
617 {index_ = i;}
618};// end class deletion
619
620/// The abstraction of an edit script for transforming a sequence A
621/// into a sequence B.
622///
623/// It encapsulates the insertions and deletions for transforming A
624/// into B.
626{
627 vector<insertion> insertions_;
628 vector<deletion> deletions_;
629
630public:
631
633 {}
634
635 const vector<insertion>&
636 insertions() const
637 {return insertions_;}
638
639 vector<insertion>&
640 insertions()
641 {return insertions_;}
642
643 const vector<deletion>&
644 deletions() const
645 {return deletions_;}
646
647 vector<deletion>&
648 deletions()
649 {return deletions_;}
650
651 void
652 append(const edit_script& es)
653 {
654 insertions().insert(insertions().end(),
655 es.insertions().begin(),
656 es.insertions().end());
657 deletions().insert(deletions().end(),
658 es.deletions().begin(),
659 es.deletions().end());
660 }
661
662 void
663 prepend(const edit_script& es)
664 {
665 insertions().insert(insertions().begin(),
666 es.insertions().begin(),
667 es.insertions().end());
668 deletions().insert(deletions().begin(),
669 es.deletions().begin(),
670 es.deletions().end());
671 }
672
673 void
674 clear()
675 {
676 insertions().clear();
677 deletions().clear();
678 }
679
680 bool
681 is_empty() const
682 {return insertions().empty() && deletions().empty();}
683
684 operator bool() const
685 {return !is_empty();}
686
687 int
688 num_insertions() const
689 {
690 int l = 0;
691 for (vector<insertion>::const_iterator i = insertions().begin();
692 i != insertions().end();
693 ++i)
694 l += i->inserted_indexes().size();
695 return l;
696 }
697
698 int
699 num_deletions() const
700 {return deletions().size();}
701
702 int
703 length() const
704 {return num_insertions() + num_deletions();}
705};//end class edit_script
706
707bool
709 unsigned a_size,
710 unsigned b_size);
711
712bool
713ends_of_furthest_d_paths_overlap(const point& forward_d_path_end,
714 const point& reverse_d_path_end);
715
716/// The default equality functor used by the core diffing algorithms.
718{
719 /// This equality operator uses the default "==" to compare its
720 /// arguments.
721 ///
722 /// @param a the first comparison argument.
723 ///
724 /// @param b the second comparison argument.
725 ///
726 /// @return true if the two arguments are equal, false otherwise.
727 template<typename T>
728 bool
729 operator()(const T a, const T b) const
730 {return a == b;}
731};
732
733
734/// An equality functor to deeply compare pointers.
736{
737 /// This equality operator compares pointers by comparing the
738 /// pointed-to objects.
739 ///
740 /// @param first the first comparison argument.
741 ///
742 /// @param second the second comparison argument.
743 ///
744 /// @return true if the objects pointed to by the pointers are
745 /// equal, false otherwise.
746 template<typename T>
747 bool
748 operator()(const T* first,
749 const T* second) const
750 {
751 if (!!first != !!second)
752 return false;
753
754 if (!first)
755 return true;
756
757 return *first == *second;
758 }
759
760 /// This equality operator compares pointers by comparing the
761 /// pointed-to objects.
762 ///
763 /// @param first the first comparison argument.
764 ///
765 /// @param second the second comparison argument.
766 ///
767 /// @return true if the objects pointed to by the pointers are
768 /// equal, false otherwise.
769 template<typename T>
770 bool
771 operator()(const shared_ptr<T> first,
772 const shared_ptr<T> second) const
773 {return operator()(first.get(), second.get());}
774
775 /// This equality operator compares pointers by comparing the
776 /// pointed-to objects.
777 ///
778 /// @param first the first comparison argument.
779 ///
780 /// @param second the second comparison argument.
781 ///
782 /// @return true if the objects pointed to by the pointers are
783 /// equal, false otherwise.
784 template<typename T>
785 bool
786 operator()(const weak_ptr<T> first,
787 const weak_ptr<T> second) const
788 {return operator()(shared_ptr<T>(first), shared_ptr<T>(second));}
789}; // end struct deep_ptr_eq_functor
790
791/// Find the end of the furthest reaching d-path on diagonal k, for
792/// two sequences. In the paper This is referred to as "the basic
793/// algorithm".
794///
795/// Unlike in the paper, the coordinates of the edit graph start at
796/// (-1,-1), rather than (0,0), and they end at (M-1, N-1), rather
797/// than (M,N).
798///
799/// @tparm RandomAccessOutputIterator the type of iterators passed to
800/// this function. It must be a random access output iterator kind.
801///
802/// @tparm EqualityFunctor this must be a class that declares a public
803/// call operator member returning a boolean and taking two arguments
804/// that must be of the same type as the one pointed to by the @ref
805/// RandomAccessOutputIterator template parameter. This functor is
806/// used to compare the elements referred to by the iterators passed in
807/// argument to this function.
808///
809/// @param k the number of the diagonal on which we want to find the
810/// end of the furthest reaching D-path.
811///
812/// @param d the D in D-Path. That's the number of insertions/deletions
813/// (the number of changes, in other words) in the changeset. That is
814/// also the number of non-diagonals in the D-Path.
815///
816/// @param a_begin an iterator to the beginning of the first sequence
817///
818/// @param a_end an iterator that points right after the last element
819/// of the second sequence to consider.
820///
821/// @param b_begin an iterator to the beginning of the second sequence.
822///
823/// @param b_end an iterator that points right after the last element
824/// of the second sequence to consider.
825///
826/// @param v the vector of furthest end points of d_paths, at (d-1).
827/// It contains the abscissas of the furthest end points for different
828/// values of k, at (d-1). That is, for k in [-D + 1, -D + 3, -D + 5,
829/// ..., D - 1], v[k] is the abscissa of the end of the furthest
830/// reaching (D-1)-path on diagonal k.
831///
832/// @param snak the last snake of the furthest path found. The end
833/// point of the snake is the end point of the furthest path.
834///
835/// @return true if the end of the furthest reaching path that was
836/// found was inside the boundaries of the edit graph, false
837/// otherwise.
838template<typename RandomAccessOutputIterator,
839 typename EqualityFunctor>
840bool
842 RandomAccessOutputIterator a_begin,
843 RandomAccessOutputIterator a_end,
844 RandomAccessOutputIterator b_start,
845 RandomAccessOutputIterator b_end,
846 d_path_vec& v, snake& snak)
847{
848 int x = -1, y = -1;
849 point begin, intermediate, diag_start, end;
850 snake s;
851 EqualityFunctor eq;
852
853 // Let's pick the end point of the furthest reaching
854 // (D-1)-path. It's either v[k-1] or v[k+1]; the word
855 // "furthest" means we choose the one which abscissa is the
856 // greatest (that is, furthest from abscissa zero).
857 if (k == -d || ((k != d) && (v[k-1] < v[k + 1])))
858 // So, the abscissa of the end point of the furthest
859 // reaching (D-1)-path is v[k+1]. That is a diagonal that
860 // is above the current (k) diagonal, and on the right.
861 // To move to the current k diagonal, one has to move
862 // "down" from the diagonal k+1. So the abscissa won't
863 // change. Only the ordinate will. It will be given by y
864 // = x - k (a bit below); as k has changed from k - 1 (it
865 // has increased), y is going to be the new y that is
866 // 'down' from the previous y in k - 1.
867 {
868 x = v[k+1];
869 begin.set(x, x - (k + 1));
870 }
871 else
872 {
873 // So the abscissa of the end point of the furthest
874 // (D-1)-path is v[k-1]. That is on the left of the
875 // current k diagonal. To move to the current k diagonal,
876 // one has to move "right" from diagonal k - 1. That is,
877 // the y stays constant and x is incremented.
878 x = v[k-1];
879 begin.set(x, x - (k - 1));
880 ++x;
881 }
882
883 // Now get the value of y from the equation k = x -y.
884 // This is the point where we first touch K, when we move
885 // from the end of the furthest reaching (D-1)-path.
886 y = x - k;
887
888 intermediate.x(x);
889 intermediate.y(y);
890
891 int last_x_index = a_end - a_begin - 1;
892 int last_y_index = b_end - b_start - 1;
893 // Now, follow the snake (aka, zero or more consecutive
894 // diagonals). Note that we stay on the k diagonal when we
895 // do this.
896 while ((x < last_x_index) && (y < last_y_index))
897 if (eq(a_begin[x + 1], b_start[y + 1]))
898 {
899 x = x + 1;
900 y = y + 1;
901 if (!diag_start)
902 diag_start.set(x, y);
903 }
904 else
905 break;
906
907 end.x(x);
908 end.y(y);
909
910 // Note the point that we store in v here might be outside the
911 // bounds of the edit graph. But we store it at this step (for a
912 // given D) anyway, because out of bound or not, we need this value
913 // at this step to be able to compute the value of the point on the
914 // "next" diagonal for the next D.
915 v[k] = x;
916
917 if (x >= (int) v.a_size()
918 || y >= (int) v.b_size()
919 || x < -1 || y < -1)
920 return false;
921
922 s.set(begin, intermediate, diag_start, end);
923 s.set_forward(true);
924 snak = s;
925
926 return true;
927}
928
929/// Find the end of the furthest reaching reverse d-path on diagonal k
930/// + delta. Delta is abs(M - N), with M being the size of a and N
931/// being the size of b. This is the "basic algorithm", run backward.
932/// That is, starting from the point (M,N) of the edit graph.
933///
934/// Unlike in the paper, the coordinates of the edit graph start at
935/// (-1,-1), rather than (0,0), and they end at (M-1, N-1), rather
936/// than (M,N).
937///
938/// @tparm RandomAccessOutputIterator the type of iterators passed to
939/// this function. It must be a random access output iterator kind.
940///
941/// @tparm EqualityFunctor this must be a class that declares a public
942/// call operator member returning a boolean and taking two arguments
943/// that must be of the same type as the one pointed to by the @ref
944/// RandomAccessOutputIterator template parameter. This functor is
945/// used to compare the elements referred to by the iterators passed in
946/// argument to this function.
947///
948/// @param k the number of the diagonal on which we want to find the
949/// end of the furthest reaching reverse D-path. Actually, we want to
950/// find the end of the furthest reaching reverse D-path on diagonal (k
951/// - delta).
952///
953/// @param d the D in D-path. That's the number of insertions/deletions
954/// (the number of changes, in other words) in the changeset. That is
955/// also the number of non-diagonals in the D-Path.
956///
957/// @param a_begin an iterator to the beginning of the first sequence
958///
959/// @param a_end an iterator that points right after the last element
960/// of the second sequence to consider.
961///
962/// @param b_begin an iterator to the beginning of the second sequence.
963///
964/// @param b_end an iterator that points right after the last element
965/// of the second sequence to consider.
966///
967/// @param v the vector of furthest end points of d_paths, at (d-1).
968/// It contains the abscissae of the furthest end points for different
969/// values of k - delta, at (d-1). That is, for k in [-D + 1, -D + 3,
970/// -D + 5, ..., D - 1], v[k - delta] is the abscissa of the end of the
971/// furthest reaching (D-1)-path on diagonal k - delta.
972///
973/// @param snak the last snake of the furthest path found. The end
974/// point of the snake is the end point of the furthest path.
975///
976/// @return true iff the end of the furthest reaching path that was
977/// found was inside the boundaries of the edit graph, false
978/// otherwise.
979template<typename RandomAccessOutputIterator,
980 typename EqualityFunctor>
981bool
983 RandomAccessOutputIterator a_begin,
984 RandomAccessOutputIterator a_end,
985 RandomAccessOutputIterator b_begin,
986 RandomAccessOutputIterator b_end,
987 d_path_vec& v, snake& snak)
988{
989 int a_size = a_end - a_begin;
990 int b_size = b_end - b_begin;
991 int delta = a_size - b_size;
992 int k_plus_delta = k + delta;
993 int x = -1, y = -1;
994 point begin, intermediate, diag_start, end;
995 snake s;
996 EqualityFunctor eq;
997
998 // Let's pick the end point of the furthest reaching (D-1)-path and
999 // move from there to reach the current k_plus_delta-line. That end
1000 // point of the furthest reaching (D-1)-path is either on
1001 // v[k_plus_delta-1] or on v[k_plus_delta+1]; the word "furthest"
1002 // means we choose the one which abscissa is the lowest (that is,
1003 // furthest from abscissa M).
1004 if (k_plus_delta == -d + delta
1005 || ((k_plus_delta != d + delta)
1006 && (v[k_plus_delta + 1] <= v[k_plus_delta - 1])))
1007 {
1008 // We move left, that means ordinate won't change ...
1009 x = v[k_plus_delta + 1];
1010 y = x - (k_plus_delta + 1);
1011 begin.set(x, y);
1012 // ... and abscissa decreases.
1013 x = x - 1;
1014 }
1015 else
1016 {
1017 // So the furthest end point is on the k_plus_delta - 1
1018 // diagonal. That is a diagonal that is 'below' the
1019 // k_plus_delta current diagonal. So to join the current
1020 // diagonal from the k_plus_delta - 1 one, we need to move up.
1021
1022 // So moving up means abscissa won't change ...
1023 x = v[k_plus_delta - 1];
1024 begin.set(x, x - (k_plus_delta - 1));
1025 // ... and that ordinate decreases.
1026 y = x - (k_plus_delta - 1) - 1;
1027 }
1028
1029 intermediate.set(x, y);
1030
1031 // Now, follow the snake. Note that we stay on the k_plus_delta
1032 // diagonal when we do this.
1033 while (x >= 0 && y >= 0)
1034 if (eq(a_begin[x], b_begin[y]))
1035 {
1036 if (!diag_start)
1037 diag_start.set(x, y);
1038 x = x - 1;
1039 y = y - 1;
1040 }
1041 else
1042 break;
1043
1044 end.set(x, y);
1045
1046 // Note the point that we store in v here might be outside the
1047 // bounds of the edit graph. But we store it at this step (for a
1048 // given D) anyway, because out of bound or not, we need this value
1049 // at this step to be able to compute the value of the point on the
1050 // "next" diagonal for the next D.
1051 v[k_plus_delta] = x;
1052
1053 if (x == -1 && y == -1)
1054 ;
1055 else if (x < -1 || y < -1)
1056 return false;
1057
1058 s.set(begin, intermediate, diag_start, end);
1059 s.set_forward(false);
1060 snak = s;
1061
1062 return true;
1063}
1064
1065/// Tests if a given point is a match point in an edit graph.
1066///
1067/// @param a_begin the begin iterator of the first input sequence of
1068/// the edit graph.
1069///
1070/// @param a_end the end iterator of the first input sequence of the
1071/// edit graph. This points to one element passed the end of the
1072/// sequence.
1073///
1074/// @param b_begin the begin iterator of the second input sequence of
1075/// the edit graph.
1076///
1077/// @param b_end the end iterator of the second input sequence of the
1078/// edit graph. This points the one element passed the end of the
1079/// sequence.
1080///
1081/// @param point the point to test for being a match point.
1082///
1083/// @return true iff \a point is a match point.
1084template<typename RandomAccessOutputIterator>
1085bool
1086is_match_point(RandomAccessOutputIterator a_begin,
1087 RandomAccessOutputIterator a_end,
1088 RandomAccessOutputIterator b_begin,
1089 RandomAccessOutputIterator b_end,
1090 const point& point)
1091{
1092 int a_size = a_end - a_begin, b_size = b_end - b_begin;
1093
1094 if (point.x() < 0
1095 || point.x () >= a_size
1096 || point.y() < 0
1097 || point.y() >= b_size)
1098 return false;
1099
1100 return (a_begin[point.x()] == b_begin[point.y()]);
1101}
1102
1103/// Returns the middle snake of two sequences A and B, as well as the
1104/// length of their shortest editing script.
1105///
1106/// This uses the "linear space refinement" algorithm presented in
1107/// section 4b in the paper. As the paper says, "The idea for doing
1108/// so is to simultaneously run the basic algorithm in both the
1109/// forward and reverse directions until furthest reaching forward and
1110/// reverse paths starting at opposing corners 'overlap'."
1111///
1112/// @tparm RandomAccessOutputIterator the type of iterators passed to
1113/// this function. It must be a random access output iterator kind.
1114///
1115/// @tparm EqualityFunctor this must be a class that declares a public
1116/// call operator member returning a boolean and taking two arguments
1117/// that must be of the same type as the one pointed to by the @ref
1118/// RandomAccessOutputIterator template parameter. This functor is
1119/// used to compare the elements referred to by the iterators passed in
1120/// argument to this function.
1121///
1122/// @param a_begin an iterator pointing to the begining of sequence A.
1123///
1124/// @param a_end an iterator pointing to the end of sequence A. Note
1125/// that this points right /after/ the end of vector A.
1126///
1127/// @param b_begin an iterator pointing to the begining of sequence B.
1128///
1129/// @param b_end an iterator pointing to the end of sequence B. Note
1130/// that this points right /after/ the end of vector B
1131///
1132/// @param snak out parameter. This is the snake current when the two
1133/// paths overlapped. This is set iff the function returns true;
1134/// otherwise, this is not touched.
1135///
1136/// @return true is the snake was found, false otherwise.
1137template<typename RandomAccessOutputIterator,
1138 typename EqualityFunctor>
1139bool
1140compute_middle_snake(RandomAccessOutputIterator a_begin,
1141 RandomAccessOutputIterator a_end,
1142 RandomAccessOutputIterator b_begin,
1143 RandomAccessOutputIterator b_end,
1144 snake& snak, int& ses_len)
1145{
1146 int a_size = a_end - a_begin;
1147 int N = a_size;
1148 int b_size = b_end - b_begin;
1149 int M = b_size;
1150 int delta = N - M;
1151 d_path_vec forward_d_paths(a_size, b_size);
1152 d_path_vec reverse_d_paths(a_size, b_size);
1153 // These points below are the top leftmost point and bottom
1154 // right-most points of the edit graph.
1155 point first_point(-1, -1), last_point(a_size -1, b_size -1), point_zero(0, 0);
1156
1157 // We want the initial step (D = 0, k = 0 in the paper) to find a
1158 // furthest reaching point on diagonal k == 0; For that, we need the
1159 // value of x for k == 1; So let's set that value to -1; that is for
1160 // k == 1 (diagonal 1), the point in the edit graph is (-1,-2).
1161 // That way, to get the furthest reaching point on diagonal 0 (k ==
1162 // 0), we go down from (-1,-2) on diagonal 1 and we hit diagonal 0
1163 // on (-1,-1); that is the starting value that the algorithm expects
1164 // for k == 0.
1165 forward_d_paths[1] = -1;
1166
1167 // Similarly for the reverse paths, for diagonal delta + 1 (note
1168 // that diagonals are centered on delta, unlike for forward paths
1169 // where they are centered on zero), we set the initial point to
1170 // (a_size, b_size - 1). That way, at step D == 0 and k == delta,
1171 // to reach diagonal delta from the point (a_size, b_size - 1) on
1172 // diagonal delta + 1, we just have to move left, and we hit
1173 // diagonal delta on (a_size - 1, b_size -1); that is the starting
1174 // point value the algorithm expects for k == 0 in the reverse case.
1175 reverse_d_paths[delta + 1] = a_size;
1176
1177 int d_max = (M + N) / 2 + 1;
1178 for (int d = 0; d <= d_max; ++d)
1179 {
1180 // First build forward paths.
1181 for (int k = -d; k <= d; k += 2)
1182 {
1183 snake s;
1184 bool found =
1185 end_of_fr_d_path_in_k<RandomAccessOutputIterator,
1186 EqualityFunctor>(k, d,
1187 a_begin, a_end,
1188 b_begin, b_end,
1189 forward_d_paths, s);
1190 if (!found)
1191 continue;
1192
1193 // As the paper says in 4b while explaining the middle snake
1194 // algorithm:
1195 //
1196 // "Thus when delta is odd, check for overlap only while
1197 // extending forward paths ..."
1198 if ((delta % 2)
1199 && (k >= (delta - (d - 1))) && (k <= (delta + (d - 1))))
1200 {
1201 point reverse_end;
1202 reverse_end.x(reverse_d_paths[k]);
1203 reverse_end.y(reverse_end.x() - k);
1204 if (ends_of_furthest_d_paths_overlap(s.end(), reverse_end))
1205 {
1206 ses_len = 2 * d - 1;
1207 snak = s;
1208 return true;
1209 }
1210 }
1211 }
1212
1213 // Now build reverse paths.
1214 for (int k = -d; k <= d; k += 2)
1215 {
1216 snake s;
1217 bool found =
1218 end_of_frr_d_path_in_k_plus_delta<RandomAccessOutputIterator,
1219 EqualityFunctor>(k, d,
1220 a_begin, a_end,
1221 b_begin, b_end,
1222 reverse_d_paths,
1223 s);
1224
1225 if (!found)
1226 continue;
1227
1228 // And the paper continues by saying:
1229 //
1230 // "... and when delta is even, check for overlap only while
1231 // extending reverse paths."
1232 int k_plus_delta = k + delta;
1233 if (!(delta % 2)
1234 && (k_plus_delta >= -d) && (k_plus_delta <= d))
1235 {
1236 point forward_end;
1237 forward_end.x(forward_d_paths[k_plus_delta]);
1238 forward_end.y(forward_end.x() - k_plus_delta);
1239 if (ends_of_furthest_d_paths_overlap(forward_end, s.end()))
1240 {
1241 ses_len = 2 * d;
1242 snak = s;
1243 return true;
1244 }
1245 }
1246 }
1247 }
1248 return false;
1249}
1250
1251bool
1252compute_middle_snake(const char* str1, const char* str2,
1253 snake& s, int& ses_len);
1254
1255/// This prints the middle snake of two strings.
1256///
1257/// @param a_begin the beginning of the first string.
1258///
1259/// @param b_begin the beginning of the second string.
1260///
1261/// @param s the snake to print.
1262///
1263/// @param out the output stream to print the snake to.
1264template<typename RandomAccessOutputIterator>
1265void
1266print_snake(RandomAccessOutputIterator a_begin,
1267 RandomAccessOutputIterator b_begin,
1268 const snake &s, ostream& out)
1269{
1270 if (s.is_empty())
1271 return;
1272
1273 out << "snake start: ";
1274 out << "(" << s.begin().x() << ", " << s.end().y() << ")\n";
1275
1276 out << "snake intermediate: ";
1277 out << "(" << s.intermediate().x() << ", " << s.intermediate().y() << ")\n";
1278
1279 out << "diagonal point(s): ";
1280 if (s.has_diagonal_edge())
1281 for (int x = s.intermediate().x(), y = s.intermediate().y();
1282 x <= s.end().x() && y <= s.end().y();
1283 ++x, ++y)
1284 {
1285 ABG_ASSERT(a_begin[x] == b_begin[y]);
1286 out << "(" << x << "," << y << ") ";
1287 }
1288 out << "\n";
1289
1290 out << "snake end: ";
1291 out << "(" << s.end().x() << ", " << s.end().y() << ")\n";
1292}
1293
1294/// Compute the length of the shortest edit script for two sequences a
1295/// and b. This is done using the "Greedy LCS/SES" of figure 2 in the
1296/// paper. It can walk the edit graph either foward (when reverse is
1297/// false) or backward starting from the end (when reverse is true).
1298///
1299/// Here, note that the real content of a and b should start at index
1300/// 1, for this implementatikon algorithm to match the paper's
1301/// algorithm in a straightforward manner. So pleast make sure that
1302/// at index 0, we just get some non-used value.
1303///
1304/// @tparm RandomAccessOutputIterator the type of iterators passed to
1305/// this function. It must be a random access output iterator kind.
1306///
1307/// @tparm EqualityFunctor this must be a class that declares a public
1308/// call operator member returning a boolean and taking two arguments
1309/// that must be of the same type as the one pointed to by the @ref
1310/// RandomAccessOutputIterator template parameter. This functor is
1311/// used to compare the elements referred to by the iterators passed in
1312/// argument to this function.
1313///
1314/// @param a the first sequence we care about.
1315///
1316/// @param b the second sequence we care about.
1317///
1318/// @param v the vector that contains the end points of the furthest
1319/// reaching d-path and (d-1)-path.
1320template<typename RandomAccessOutputIterator,
1321 typename EqualityFunctor>
1322int
1323ses_len(RandomAccessOutputIterator a_begin,
1324 RandomAccessOutputIterator a_end,
1325 RandomAccessOutputIterator b_begin,
1326 RandomAccessOutputIterator b_end,
1327 d_path_vec& v, bool reverse)
1328{
1329 unsigned a_size = a_end - a_begin;
1330 unsigned b_size = b_end - b_begin;
1331 snake snak;
1332
1333 ABG_ASSERT(v.max_d() == a_size + b_size);
1334
1335 int delta = a_size - b_size;
1336
1337 if (reverse)
1338 // Set a fictitious (M, N-1) into v[1], to find the furthest
1339 // reaching reverse 0-path (i.e, when we are at d == 0 and k == 0).
1340 v[delta + 1] = a_size - 1;
1341 else
1342 // Set a fictitious (-1,-2) point into v[1], to find the furthest
1343 // reaching forward 0-path (i.e, when we are at d == 0 and k == 0).
1344 v[1] = -1;
1345
1346 for (unsigned d = 0; d <= v.max_d(); ++d)
1347 {
1348 for (int k = -d; k <= (int) d; k += 2)
1349 {
1350 point end;
1351 if (reverse)
1352 {
1353 bool found =
1354 end_of_frr_d_path_in_k_plus_delta<RandomAccessOutputIterator,
1355 EqualityFunctor>(k, d,
1356 a_begin, a_end,
1357 b_begin, b_end,
1358 v, snak);
1359 // If we reached the upper left corner of the edit graph then
1360 // we are done.
1361 if (found && snak.end().x() == -1 && snak.end().y() == -1)
1362 return d;
1363 }
1364 else
1365 {
1366 end_of_fr_d_path_in_k<RandomAccessOutputIterator,
1367 EqualityFunctor>(k, d,
1368 a_begin, a_end,
1369 b_begin, b_end,
1370 v, snak);
1371 // If we reached the lower right corner of the edit
1372 // graph then we are done.
1373 if ((snak.end().x() == (int) a_size - 1)
1374 && (snak.end().y() == (int) b_size - 1))
1375 return d;
1376 }
1377 }
1378 }
1379 return 0;
1380}
1381
1382/// Compute the length of the shortest edit script for two sequences a
1383/// and b. This is done using the "Greedy LCS/SES" of figure 2 in the
1384/// paper. It can walk the edit graph either foward (when reverse is
1385/// false) or backward starting from the end (when reverse is true).
1386///
1387/// Here, note that the real content of a and b should start at index
1388/// 1, for this implementatikon algorithm to match the paper's
1389/// algorithm in a straightforward manner. So pleast make sure that
1390/// at index 0, we just get some non-used value.
1391///
1392/// Note that the equality operator used to compare the elements
1393/// passed in argument to this function is the default "==" operator.
1394///
1395/// @tparm RandomAccessOutputIterator the type of iterators passed to
1396/// this function. It must be a random access output iterator kind.
1397///
1398/// @param a the first sequence we care about.
1399///
1400/// @param b the second sequence we care about.
1401///
1402/// @param v the vector that contains the end points of the furthest
1403/// reaching d-path and (d-1)-path.
1404template<typename RandomAccessOutputIterator>
1405int
1406ses_len(RandomAccessOutputIterator a_begin,
1407 RandomAccessOutputIterator a_end,
1408 RandomAccessOutputIterator b_begin,
1409 RandomAccessOutputIterator b_end,
1410 d_path_vec& v, bool reverse)
1411{
1413 b_begin, b_end,
1414 v, reverse);
1415}
1416
1417int
1418ses_len(const char* str1,
1419 const char* str2,
1420 bool reverse = false);
1421
1422bool
1423snake_end_points(const snake& s, point&, point&);
1424
1425/// Compute the longest common subsequence of two (sub-regions of)
1426/// sequences as well as the shortest edit script from transforming
1427/// the first (sub-region of) sequence into the second (sub-region of)
1428/// sequence.
1429///
1430/// A sequence is determined by a base, a beginning offset and an end
1431/// offset. The base always points to the container that contains the
1432/// sequence to consider. The beginning offset is an iterator that
1433/// points the beginning of the sub-region of the sequence that we
1434/// actually want to consider. The end offset is an iterator that
1435/// points to the end of the sub-region of the sequence that we
1436/// actually want to consider.
1437///
1438/// This uses the LCS algorithm of the paper at section 4b.
1439///
1440/// @tparm RandomAccessOutputIterator the type of iterators passed to
1441/// this function. It must be a random access output iterator kind.
1442///
1443/// @tparm EqualityFunctor this must be a class that declares a public
1444/// call operator member returning a boolean and taking two arguments
1445/// that must be of the same type as the one pointed to by the @ref
1446/// RandomAccessOutputIterator template parameter. This functor is
1447/// used to compare the elements referred to by the iterators passed in
1448/// argument to this function.
1449///
1450/// @param a_base the iterator to the base of the first sequence.
1451///
1452/// @param a_start an iterator to the beginning of the sub-region
1453/// of the first sequence to actually consider.
1454///
1455/// @param a_end an iterator to the end of the sub-region of the first
1456/// sequence to consider.
1457///
1458///@param b_base an iterator to the base of the second sequence to
1459///consider.
1460///
1461/// @param b_start an iterator to the beginning of the sub-region
1462/// of the second sequence to actually consider.
1463///
1464/// @param b_end an iterator to the end of the sub-region of the
1465/// second sequence to actually consider.
1466///
1467/// @param lcs the resulting lcs. This is set iff the function
1468/// returns true.
1469///
1470/// @param ses the resulting shortest editing script.
1471///
1472/// @param ses_len the length of the ses above. Normally this can be
1473/// retrieved from ses.length(), but this parameter is here for sanity
1474/// check purposes. The function computes the length of the ses in
1475/// two redundant ways and ensures that both methods lead to the same
1476/// result.
1477///
1478/// @return true upon successful completion, false otherwise.
1479template<typename RandomAccessOutputIterator,
1480 typename EqualityFunctor>
1481void
1482compute_diff(RandomAccessOutputIterator a_base,
1483 RandomAccessOutputIterator a_begin,
1484 RandomAccessOutputIterator a_end,
1485 RandomAccessOutputIterator b_base,
1486 RandomAccessOutputIterator b_begin,
1487 RandomAccessOutputIterator b_end,
1488 vector<point>& lcs,
1489 edit_script& ses,
1490 int& ses_len)
1491{
1492 int a_size = a_end - a_begin;
1493 int b_size = b_end - b_begin;
1494 unsigned a_offset = a_begin - a_base, b_offset = b_begin - b_base;
1495
1496 if (a_size == 0 || b_size == 0)
1497 {
1498 if (a_size > 0 && b_size == 0)
1499 // All elements of the first sequences have been deleted. So add
1500 // the relevant deletions to the edit script.
1501 for (RandomAccessOutputIterator i = a_begin; i < a_end; ++i)
1502 ses.deletions().push_back(deletion(i - a_base));
1503
1504 if (b_size > 0 && a_size == 0)
1505 {
1506 // All elements present in the second sequence are part of
1507 // an insertion into the first sequence at a_end. So add
1508 // that insertion to the edit script.
1509 int a_full_size = a_end - a_base;
1510 int insertion_index = a_full_size ? a_full_size - 1 : -1;
1511 insertion ins(insertion_index);
1512 for (RandomAccessOutputIterator i = b_begin; i < b_end; ++i)
1513 ins.inserted_indexes().push_back(i - b_base);
1514
1515 ses.insertions().push_back(ins);
1516 }
1517
1518 ses_len = a_size + b_size;
1519 return;
1520 }
1521
1522 int d = 0;
1523 snake snak;
1524 vector<point> trace; // the trace of the edit graph. Read the paper
1525 // to understand what a trace is.
1526 bool has_snake =
1527 compute_middle_snake<RandomAccessOutputIterator,
1528 EqualityFunctor>(a_begin, a_end,
1529 b_begin, b_end,
1530 snak, d);
1531 if (has_snake)
1532 {
1533 // So middle_{begin,end} are expressed wrt a_begin and b_begin.
1534 // Let's express them wrt a_base and b_base.
1535 snak.add(a_offset, b_offset);
1536 ses_len = d;
1537 }
1538
1539 if (has_snake)
1540 {
1541 if ( snak.has_diagonal_edge())
1542 for (int x = snak.diagonal_start().x(), y = snak.diagonal_start().y();
1543 x <= snak.end().x() && y <= snak.end().y();
1544 ++x, ++y)
1545 {
1546 point p(x, y);
1547 trace.push_back(p);
1548 }
1549 }
1550 else
1551 {
1552 // So there is no middle snake. That means there is no lcs, so
1553 // the two sequences are different.
1554
1555 // In other words, all the elements of the first sequence have
1556 // been deleted ...
1557 for (RandomAccessOutputIterator i = a_begin; i < a_end; ++i)
1558 ses.deletions().push_back(deletion(i - a_base));
1559
1560 // ... and all the elements of the second sequence are insertions
1561 // that happen at the beginning of the first sequence.
1562 insertion ins(a_begin - a_base);
1563 for (RandomAccessOutputIterator i = b_begin; i < b_end; ++i)
1564 ins.inserted_indexes().push_back(i - b_base);
1565 ses.insertions().push_back(ins);
1566
1567 ses_len = a_size + b_size;
1568 ABG_ASSERT(ses_len == ses.length());
1569 return;
1570 }
1571
1572 if (d > 1)
1573 {
1574 int tmp_ses_len0 = 0;
1575 edit_script tmp_ses0;
1576 point px, pu;
1577 snake_end_points(snak, px, pu);
1578 compute_diff<RandomAccessOutputIterator,
1579 EqualityFunctor>(a_base, a_begin, a_base + (px.x() + 1),
1580 b_base, b_begin, b_base + (px.y() + 1),
1581 lcs, tmp_ses0, tmp_ses_len0);
1582
1583 lcs.insert(lcs.end(), trace.begin(), trace.end());
1584
1585 int tmp_ses_len1 = 0;
1586 edit_script tmp_ses1;
1587 compute_diff<RandomAccessOutputIterator,
1588 EqualityFunctor>(a_base, a_base + (pu.x() + 1), a_end,
1589 b_base, b_base + (pu.y() + 1), b_end,
1590 lcs, tmp_ses1, tmp_ses_len1);
1591 ABG_ASSERT(tmp_ses0.length() + tmp_ses1.length() == d);
1592 ABG_ASSERT(tmp_ses_len0 + tmp_ses_len1 == d);
1593 ses.append(tmp_ses0);
1594 ses.append(tmp_ses1);
1595 }
1596 else if (d == 1)
1597 {
1598 if (snak.has_diagonal_edge())
1599 {
1600 for (int x = snak.diagonal_start().x(), y = snak.diagonal_start().y();
1601 x <= snak.end().x() && y <= snak.end().y();
1602 ++x, ++y)
1603 {
1604 point p(x, y);
1605 trace.push_back(p);
1606 }
1607 }
1608
1609 if (snak.has_vertical_edge())
1610 {
1611 point p = snak.intermediate();
1612 insertion ins(p.x());
1613 ins.inserted_indexes().push_back(p.y());
1614 ses.insertions().push_back(ins);
1615 }
1616 else if (snak.has_horizontal_edge())
1617 {
1618 if (snak.is_forward())
1619 {
1620 deletion del(snak.intermediate().x());
1621 ses.deletions().push_back(del);
1622 }
1623 else
1624 {
1625 deletion del(snak.begin().x());
1626 ses.deletions().push_back(del);
1627 }
1628 }
1629 }
1630 else if (d == 0)
1631 {
1632 // Obviously on the middle snake is part of the solution, as
1633 // there is no edit script; iow, the two sequences are
1634 // identical.
1635 lcs.insert(lcs.end(), trace.begin(), trace.end());
1636 ses_len = 0;
1637 }
1638
1639 ABG_ASSERT(ses_len == ses.length());
1640}
1641
1642/// Compute the longest common subsequence of two (sub-regions of)
1643/// sequences as well as the shortest edit script from transforming
1644/// the first (sub-region of) sequence into the second (sub-region of)
1645/// sequence.
1646///
1647/// This uses the LCS algorithm of the paper at section 4b.
1648///
1649/// @tparm RandomAccessOutputIterator the type of iterators passed to
1650/// this function. It must be a random access output iterator kind.
1651///
1652/// @tparm EqualityFunctor this must be a class that declares a public
1653/// call operator member returning a boolean and taking two arguments
1654/// that must be of the same type as the one pointed to by the @ref
1655/// RandomAccessOutputIterator template parameter. This functor is
1656/// used to compare the elements referred to by the iterators passed in
1657/// argument to this function.
1658///
1659/// @param a_start an iterator to the beginning of the first sequence
1660/// to consider.
1661///
1662/// @param a_end an iterator to the end of the first sequence to
1663/// consider.
1664///
1665/// @param b_start an iterator to the beginning of the second sequence
1666/// to consider.
1667///
1668/// @param b_end an iterator to the end of the second sequence to
1669/// consider.
1670///
1671/// @param lcs the resulting lcs. This is set iff the function
1672/// returns true.
1673///
1674/// @param ses the resulting shortest editing script.
1675///
1676/// @param ses_len the length of the ses above. Normally this can be
1677/// retrieved from ses.length(), but this parameter is here for sanity
1678/// check purposes. The function computes the length of the ses in
1679/// two redundant ways and ensures that both methods lead to the same
1680/// result.
1681///
1682/// @return true upon successful completion, false otherwise.
1683template<typename RandomAccessOutputIterator,
1684 typename EqualityFunctor>
1685void
1686compute_diff(RandomAccessOutputIterator a_begin,
1687 RandomAccessOutputIterator a_end,
1688 RandomAccessOutputIterator b_begin,
1689 RandomAccessOutputIterator b_end,
1690 vector<point>& lcs,
1691 edit_script& ses,
1692 int& ses_len)
1693{
1694 compute_diff<RandomAccessOutputIterator,
1695 EqualityFunctor>(a_begin, a_begin, a_end,
1696 b_begin, b_begin, b_end,
1697 lcs, ses, ses_len);
1698}
1699
1700/// Compute the longest common subsequence of two (sub-regions of)
1701/// sequences as well as the shortest edit script from transforming
1702/// the first (sub-region of) sequence into the second (sub-region of)
1703/// sequence.
1704///
1705/// A sequence is determined by a base, a beginning offset and an end
1706/// offset. The base always points to the container that contains the
1707/// sequence to consider. The beginning offset is an iterator that
1708/// points the beginning of the sub-region of the sequence that we
1709/// actually want to consider. The end offset is an iterator that
1710/// points to the end of the sub-region of the sequence that we
1711/// actually want to consider.
1712///
1713/// This uses the LCS algorithm of the paper at section 4b.
1714///
1715/// @tparm RandomAccessOutputIterator the type of iterators passed to
1716/// this function. It must be a random access output iterator kind.
1717///
1718/// @tparm EqualityFunctor this must be a class that declares a public
1719/// call operator member returning a boolean and taking two arguments
1720/// that must be of the same type as the one pointed to by the @ref
1721/// RandomAccessOutputIterator template parameter. This functor is
1722/// used to compare the elements referred to by the iterators passed in
1723/// argument to this function.
1724///
1725/// @param a_base the iterator to the base of the first sequence.
1726///
1727/// @param a_start an iterator to the beginning of the sub-region
1728/// of the first sequence to actually consider.
1729///
1730/// @param a_end an iterator to the end of the sub-region of the first
1731/// sequence to consider.
1732///
1733///@param b_base an iterator to the base of the second sequence to
1734///consider.
1735///
1736/// @param b_start an iterator to the beginning of the sub-region
1737/// of the second sequence to actually consider.
1738///
1739/// @param b_end an iterator to the end of the sub-region of the
1740/// second sequence to actually consider.
1741///
1742/// @param lcs the resulting lcs. This is set iff the function
1743/// returns true.
1744///
1745/// @param ses the resulting shortest editing script.
1746///
1747/// @return true upon successful completion, false otherwise.
1748template<typename RandomAccessOutputIterator,
1749 typename EqualityFunctor>
1750void
1751compute_diff(RandomAccessOutputIterator a_base,
1752 RandomAccessOutputIterator a_begin,
1753 RandomAccessOutputIterator a_end,
1754 RandomAccessOutputIterator b_base,
1755 RandomAccessOutputIterator b_begin,
1756 RandomAccessOutputIterator b_end,
1757 vector<point>& lcs,
1758 edit_script& ses)
1759{
1760 int ses_len = 0;
1761
1762 compute_diff<RandomAccessOutputIterator,
1763 EqualityFunctor>(a_base, a_begin, a_end,
1764 b_base, b_begin, b_end,
1765 lcs, ses, ses_len);
1766}
1767
1768/// Compute the longest common subsequence of two (sub-regions of)
1769/// sequences as well as the shortest edit script from transforming
1770/// the first (sub-region of) sequence into the second (sub-region of)
1771/// sequence.
1772///
1773/// This uses the LCS algorithm of the paper at section 4b.
1774///
1775/// @tparm RandomAccessOutputIterator the type of iterators passed to
1776/// this function. It must be a random access output iterator kind.
1777///
1778/// @tparm EqualityFunctor this must be a class that declares a public
1779/// call operator member returning a boolean and taking two arguments
1780/// that must be of the same type as the one pointed to by the @ref
1781/// RandomAccessOutputIterator template parameter. This functor is
1782/// used to compare the elements referred to by the iterators passed in
1783/// argument to this function.
1784///
1785/// @param a_start an iterator to the beginning of the first sequence
1786/// to consider.
1787///
1788/// @param a_end an iterator to the end of the first sequence to
1789/// consider.
1790///
1791/// @param b_start an iterator to the beginning of the sequence to
1792/// actually consider.
1793///
1794/// @param b_end an iterator to the end of second sequence to
1795/// consider.
1796///
1797/// @param lcs the resulting lcs. This is set iff the function
1798/// returns true.
1799///
1800/// @param ses the resulting shortest editing script.
1801///
1802/// @return true upon successful completion, false otherwise.
1803template<typename RandomAccessOutputIterator,
1804 typename EqualityFunctor>
1805void
1806compute_diff(RandomAccessOutputIterator a_begin,
1807 RandomAccessOutputIterator a_end,
1808 RandomAccessOutputIterator b_begin,
1809 RandomAccessOutputIterator b_end,
1810 vector<point>& lcs,
1811 edit_script& ses)
1812{
1813 compute_diff<RandomAccessOutputIterator,
1814 EqualityFunctor>(a_begin, a_begin, a_end,
1815 b_begin, b_begin, b_end,
1816 lcs, ses);
1817}
1818
1819/// Compute the longest common subsequence of two (sub-regions of)
1820/// sequences as well as the shortest edit script from transforming
1821/// the first (sub-region of) sequence into the second (sub-region of)
1822/// sequence.
1823///
1824/// This uses the LCS algorithm of the paper at section 4b.
1825///
1826/// @tparm RandomAccessOutputIterator the type of iterators passed to
1827/// this function. It must be a random access output iterator kind.
1828///
1829/// @param a_start an iterator to the beginning of the first sequence
1830/// to consider.
1831///
1832/// @param a_end an iterator to the end of the first sequence to
1833/// consider.
1834///
1835/// @param b_start an iterator to the beginning of the sequence to
1836/// actually consider.
1837///
1838/// @param b_end an iterator to the end of second sequence to
1839/// consider.
1840///
1841/// @param lcs the resulting lcs. This is set iff the function
1842/// returns true.
1843///
1844/// @param ses the resulting shortest editing script.
1845///
1846/// @return true upon successful completion, false otherwise.
1847template<typename RandomAccessOutputIterator>
1848void
1849compute_diff(RandomAccessOutputIterator a_begin,
1850 RandomAccessOutputIterator a_end,
1851 RandomAccessOutputIterator b_begin,
1852 RandomAccessOutputIterator b_end,
1853 vector<point>& lcs,
1854 edit_script& ses)
1855{
1856 compute_diff<RandomAccessOutputIterator,
1857 default_eq_functor>(a_begin, a_end, b_begin, b_end, lcs, ses);
1858}
1859
1860/// Compute the longest common subsequence of two (sub-regions of)
1861/// sequences as well as the shortest edit script from transforming
1862/// the first (sub-region of) sequence into the second (sub-region of)
1863/// sequence.
1864///
1865/// A sequence is determined by a base, a beginning offset and an end
1866/// offset. The base always points to the container that contains the
1867/// sequence to consider. The beginning offset is an iterator that
1868/// points the beginning of the sub-region of the sequence that we
1869/// actually want to consider. The end offset is an iterator that
1870/// points to the end of the sub-region of the sequence that we
1871/// actually want to consider.
1872///
1873/// This uses the LCS algorithm of the paper at section 4b.
1874///
1875/// @tparm RandomAccessOutputIterator the type of iterators passed to
1876/// this function. It must be a random access output iterator kind.
1877///
1878/// @tparm EqualityFunctor this must be a class that declares a public
1879/// call operator member returning a boolean and taking two arguments
1880/// that must be of the same type as the one pointed to by the @ref
1881/// RandomAccessOutputIterator template parameter. This functor is
1882/// used to compare the elements referred to by the iterators passed in
1883/// argument to this function.
1884///
1885/// @param a_base the iterator to the base of the first sequence.
1886///
1887/// @param a_start an iterator to the beginning of the sub-region
1888/// of the first sequence to actually consider.
1889///
1890/// @param a_end an iterator to the end of the sub-region of the first
1891/// sequence to consider.
1892///
1893/// @param b_base an iterator to the base of the second sequence to
1894/// consider.
1895///
1896/// @param b_start an iterator to the beginning of the sub-region
1897/// of the second sequence to actually consider.
1898///
1899/// @param b_end an iterator to the end of the sub-region of the
1900/// second sequence to actually consider.
1901///
1902/// @param ses the resulting shortest editing script.
1903///
1904/// @return true upon successful completion, false otherwise.
1905template<typename RandomAccessOutputIterator,
1906 typename EqualityFunctor>
1907void
1908compute_diff(RandomAccessOutputIterator a_base,
1909 RandomAccessOutputIterator a_begin,
1910 RandomAccessOutputIterator a_end,
1911 RandomAccessOutputIterator b_base,
1912 RandomAccessOutputIterator b_begin,
1913 RandomAccessOutputIterator b_end,
1914 edit_script& ses)
1915{
1916 vector<point> lcs;
1917
1918 compute_diff<RandomAccessOutputIterator,
1919 EqualityFunctor>(a_base, a_begin, a_end,
1920 b_base, b_begin, b_end,
1921 lcs, ses);
1922}
1923
1924/// Compute the longest common subsequence of two (sub-regions of)
1925/// sequences as well as the shortest edit script from transforming
1926/// the first (sub-region of) sequence into the second (sub-region of)
1927/// sequence.
1928///
1929/// This uses the LCS algorithm of the paper at section 4b.
1930///
1931/// @tparm RandomAccessOutputIterator the type of iterators passed to
1932/// this function. It must be a random access output iterator kind.
1933///
1934/// @tparm EqualityFunctor this must be a class that declares a public
1935/// call operator member returning a boolean and taking two arguments
1936/// that must be of the same type as the one pointed to by the @ref
1937/// RandomAccessOutputIterator template parameter. This functor is
1938/// used to compare the elements referred to by the iterators passed in
1939/// argument to this function.
1940///
1941/// @param a_start an iterator to the beginning of the first sequence
1942/// to consider.
1943///
1944/// @param a_end an iterator to the end of the first sequence to
1945/// consider.
1946///
1947/// @param b_start an iterator to the beginning of the second sequence
1948/// to consider.
1949///
1950/// @param b_end an iterator to the end of the second sequence to
1951/// consider.
1952///
1953/// @param ses the resulting shortest editing script.
1954///
1955/// @return true upon successful completion, false otherwise.
1956template<typename RandomAccessOutputIterator,
1957 typename EqualityFunctor>
1958void
1959compute_diff(RandomAccessOutputIterator a_begin,
1960 RandomAccessOutputIterator a_end,
1961 RandomAccessOutputIterator b_begin,
1962 RandomAccessOutputIterator b_end,
1963 edit_script& ses)
1964{
1965 compute_diff<RandomAccessOutputIterator,
1966 EqualityFunctor>(a_begin, a_begin, a_end,
1967 b_begin, b_begin, b_end,
1968 ses);
1969}
1970
1971/// Compute the longest common subsequence of two (sub-regions of)
1972/// sequences as well as the shortest edit script from transforming
1973/// the first (sub-region of) sequence into the second (sub-region of)
1974/// sequence.
1975///
1976/// This uses the LCS algorithm of the paper at section 4b.
1977///
1978/// @tparm RandomAccessOutputIterator the type of iterators passed to
1979/// this function. It must be a random access output iterator kind.
1980///
1981/// @param a_start an iterator to the beginning of the first sequence
1982/// to consider.
1983///
1984/// @param a_end an iterator to the end of the first sequence to
1985/// consider.
1986///
1987/// @param b_start an iterator to the beginning of the second sequence
1988/// to consider.
1989///
1990/// @param b_end an iterator to the end of the second sequence to
1991/// consider.
1992///
1993/// @param ses the resulting shortest editing script.
1994///
1995/// @return true upon successful completion, false otherwise.
1996template<typename RandomAccessOutputIterator>
1997void
1998compute_diff(RandomAccessOutputIterator a_begin,
1999 RandomAccessOutputIterator a_end,
2000 RandomAccessOutputIterator b_begin,
2001 RandomAccessOutputIterator b_end,
2002 edit_script& ses)
2003{
2005 b_begin, b_end,
2006 ses);
2007}
2008
2009void
2010compute_lcs(const char* str1, const char* str2, int &ses_len, string& lcs);
2011
2012void
2013compute_ses(const char* str1, const char* str2, edit_script& ses);
2014
2015/// Display an edit script on standard output.
2016///
2017/// @param es the edit script to display
2018///
2019/// @param str1_base the first string the edit script is about.
2020///
2021/// @pram str2_base the second string the edit script is about.
2022template<typename RandomAccessOutputIterator>
2023void
2025 const RandomAccessOutputIterator str1_base,
2026 const RandomAccessOutputIterator str2_base,
2027 ostream& out)
2028{
2029 if (es.num_deletions() == 0)
2030 out << "no deletion:\n";
2031 else if (es.num_deletions() == 1)
2032 {
2033 out << "1 deletion:\n"
2034 << "\t happened at index: ";
2035 }
2036 else
2037 {
2038 out << es.num_deletions() << " deletions:\n"
2039 << "\t happened at indexes: ";
2040 }
2041
2042 for (vector<deletion>::const_iterator i = es.deletions().begin();
2043 i != es.deletions().end();
2044 ++i)
2045 {
2046 if (i != es.deletions().begin())
2047 out << ", ";
2048 out << i->index() << " (" << str1_base[i->index()] << ")";
2049 }
2050 out << "\n\n";
2051
2052 if (es.num_insertions() == 0)
2053 out << "no insertion\n";
2054 else if (es.num_insertions() == 1)
2055 out << "1 insertion\n";
2056 else
2057 out << es.num_insertions() << " insertions:\n";
2058 for (vector<insertion>::const_iterator i = es.insertions().begin();
2059 i != es.insertions().end();
2060 ++i)
2061 {
2062 int idx = i->insertion_point_index();
2063 if (idx < 0)
2064 out << "\t before index of first sequence: " << idx + 1
2065 << " (" << str1_base[idx + 1] << ")\n";
2066 else
2067 out << "\t after index of first sequence: " << idx
2068 << " (" << str1_base[idx] << ")\n";
2069
2070 if (!i->inserted_indexes().empty())
2071 out << "\t\t inserted indexes from second sequence: ";
2072
2073 for (vector<unsigned>::const_iterator j = i->inserted_indexes().begin();
2074 j != i->inserted_indexes().end();
2075 ++j)
2076 {
2077 if (j != i->inserted_indexes().begin())
2078 out << ", ";
2079 out << *j << " (" << str2_base[*j] << ")";
2080 }
2081 out << "\n";
2082 }
2083 out << "\n\n";
2084}
2085
2086}//end namespace diff_utils
2087
2088}//end namespace abigail
2089#endif // __ABG_DIFF_UTILS_H__
#define ABG_ASSERT(cond)
This is a wrapper around the 'assert' glibc call. It allows for its argument to have side effects,...
Definition abg-fwd.h:1718
The array containing the furthest D-path end-points, for each value of K. MAX_D is the maximum value ...
d_path_vec(unsigned size1, unsigned size2)
Constructor of the d_path_vec.
The abstraction of the deletion of one element of a sequence A.
The abstraction of an edit script for transforming a sequence A into a sequence B.
The abstration of an insertion of elements of a sequence B into a sequence A. This is used to represe...
A class representing a vertex in an edit graph, as explained in the paper. A vertex is a basically a ...
The abstraction of the Snake concept, from the paper.
snake()
Default constructor for snake.
const point & intermediate() const
Getter for the end point of the non-diagonal edge of the snake.
void intermediate(const point &p)
Setter for the end point of the non-diagonal edge of the snake, aka intermediate point.
void end(const point &p)
Setter for the end point of the last diagonal edge, aka snake end point. Note that if the snake has n...
void begin(const point &p)
Getter for the starting point of the non-diagonal edge of the snake, aka begin point.
const point & diagonal_start() const
Getter for the end point of the first diagonal edge, aka diagonal start point. Note that if the snake...
const point & end() const
Getter for the end point of the last diagonal edge, aka snake end point. Note that if the snake has n...
snake(const point &b, const point &i, const point &e)
Constructor from the beginning, intermediate and end points.
void diagonal_start(const point &p)
Setter for the end point of the first diagonal edge, aka diagonal start point.
void set_forward(bool f)
Set to true if the snake is a forward snake; that is, if it was built while walking the edit graph go...
void add(int x_offset, int y_offset)
Add an offset to the abscissas of the points of the snake, and add another offset to the ordinates of...
void set(const point &b, const point &i, const point &d, const point &e)
Setter for the begin, intermediate, diagonal start and end points of the snake.
void set(const point &b, const point &i, const point &e)
Setter for the begin, intermediate and end points of the snake.
snake(const point &b, const point &i, const point &d, const point &e)
Constructor from the beginning, intermediate and end points.
const point & begin() const
Getter for the starting point of the non-diagonal edge of the snake.
bool is_match_point(RandomAccessOutputIterator a_begin, RandomAccessOutputIterator a_end, RandomAccessOutputIterator b_begin, RandomAccessOutputIterator b_end, const point &point)
Tests if a given point is a match point in an edit graph.
bool compute_middle_snake(const char *str1, const char *str2, snake &s, int &ses_len)
Returns the middle snake of two strings, as well as the length of their shortest editing script.
bool snake_end_points(const snake &s, point &x, point &u)
Get the end points of the snake, as intended by the paper.
void compute_diff(RandomAccessOutputIterator a_base, RandomAccessOutputIterator a_begin, RandomAccessOutputIterator a_end, RandomAccessOutputIterator b_base, RandomAccessOutputIterator b_begin, RandomAccessOutputIterator b_end, vector< point > &lcs, edit_script &ses, int &ses_len)
Compute the longest common subsequence of two (sub-regions of) sequences as well as the shortest edit...
bool ends_of_furthest_d_paths_overlap(const point &forward_d_path_end, const point &reverse_d_path_end)
void display_edit_script(const edit_script &es, const RandomAccessOutputIterator str1_base, const RandomAccessOutputIterator str2_base, ostream &out)
Display an edit script on standard output.
bool end_of_frr_d_path_in_k_plus_delta(int k, int d, RandomAccessOutputIterator a_begin, RandomAccessOutputIterator a_end, RandomAccessOutputIterator b_begin, RandomAccessOutputIterator b_end, d_path_vec &v, snake &snak)
Find the end of the furthest reaching reverse d-path on diagonal k + delta. Delta is abs(M - N),...
void compute_lcs(const char *str1, const char *str2, int &ses_len, string &lcs)
Compute the longest common subsequence of two strings and return the length of the shortest edit scri...
void print_snake(RandomAccessOutputIterator a_begin, RandomAccessOutputIterator b_begin, const snake &s, ostream &out)
This prints the middle snake of two strings.
void compute_ses(const char *str1, const char *str2, edit_script &ses)
Compute the shortest edit script for transforming a string into another.
int ses_len(const char *str1, const char *str2, bool reverse)
Compute the length of the shortest edit script for two strings. This is done using the "Greedy LCS/SE...
bool point_is_valid_in_graph(point &p, unsigned a_size, unsigned b_size)
bool end_of_fr_d_path_in_k(int k, int d, RandomAccessOutputIterator a_begin, RandomAccessOutputIterator a_end, RandomAccessOutputIterator b_start, RandomAccessOutputIterator b_end, d_path_vec &v, snake &snak)
Find the end of the furthest reaching d-path on diagonal k, for two sequences. In the paper This is r...
Toplevel namespace for libabigail.
An equality functor to deeply compare pointers.
bool operator()(const shared_ptr< T > first, const shared_ptr< T > second) const
This equality operator compares pointers by comparing the pointed-to objects.
bool operator()(const T *first, const T *second) const
This equality operator compares pointers by comparing the pointed-to objects.
bool operator()(const weak_ptr< T > first, const weak_ptr< T > second) const
This equality operator compares pointers by comparing the pointed-to objects.
The default equality functor used by the core diffing algorithms.
bool operator()(const T a, const T b) const
This equality operator uses the default "==" to compare its arguments.