This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH 6/7] stdlib: Optimization qsort{_r} swap implementation
- From: Adhemerval Zanella <adhemerval dot zanella at linaro dot org>
- To: libc-alpha at sourceware dot org
- Date: Thu, 18 Jan 2018 15:53:21 -0200
- Subject: [PATCH 6/7] stdlib: Optimization qsort{_r} swap implementation
- Authentication-results: sourceware.org; auth=none
- References: <1516298002-4618-1-git-send-email-adhemerval.zanella@linaro.org>
This patchs adds a optimized swap operation on qsort based in previous
msort one. Instead of byte operation, three variants are provided:
1. Using uint32_t loads and stores.
2. Using uint64_t loads and stores.
3. Generic one with a temporary buffer and memcpy/mempcpy.
The 1. and 2. option are selected only either if architecture defines
_STRING_ARCH_unaligned or if base pointer is aligned to required type.
This is due based on data for bench-qsort, usually programs calls
qsort with array with multiple of machine word as element size.
Benchmarking shows an increase performance:
Results for member size 4
Sorted
nmemb | base | patched | diff
32| 1401 | 1958 | 39.76
4096| 351333 | 368533 | 4.90
32768| 3369386 | 3131712 | -7.05
524288| 63192972 | 59807494 | -5.36
MostlySorted
nmemb | base | patched | diff
32| 2391 | 2061 | -13.80
4096| 1124074 | 961816 | -14.43
32768| 11196607 | 9410438 | -15.95
524288| 215908169 | 185586732 | -14.04
Unsorted
nmemb | base | patched | diff
32| 4993 | 2021 | -59.52
4096| 1113860 | 963126 | -13.53
32768| 11251293 | 9518795 | -15.40
524288| 217252237 | 185072278 | -14.81
Results for member size 8
Sorted
nmemb | base | patched | diff
32| 1296 | 1267 | -2.24
4096| 359418 | 334852 | -6.83
32768| 3535229 | 3345157 | -5.38
524288| 69847251 | 67029358 | -4.03
MostlySorted
nmemb | base | patched | diff
32| 2745 | 2340 | -14.75
4096| 1222082 | 1014314 | -17.00
32768| 12244800 | 9924706 | -18.95
524288| 241557971 | 196898760 | -18.49
Unsorted
nmemb | base | patched | diff
32| 2972 | 2389 | -19.62
4096| 1314861 | 1024052 | -22.12
32768| 12397909 | 10120848 | -18.37
524288| 241789262 | 193414824 | -20.01
Results for member size 32
Sorted
nmemb | base | patched | diff
32| 1305 | 1287 | -1.38
4096| 346332 | 347979 | 0.48
32768| 3458244 | 3408058 | -1.45
524288| 72793445 | 69973719 | -3.87
MostlySorted
nmemb | base | patched | diff
32| 5435 | 4890 | -10.03
4096| 2032260 | 1688556 | -16.91
32768| 19909035 | 16419992 | -17.52
524288| 390339319 | 325921585 | -16.50
Unsorted
nmemb | base | patched | diff
32| 5833 | 5351 | -8.26
4096| 2022531 | 1724961 | -14.71
32768| 19842888 | 16588545 | -16.40
524288| 388838382 | 324102703 | -16.65
Checked on x86_64-linux-gnu.
[BZ #19305].
* stdlib/qsort.c (SWAP): Remove.
(check_alignment, swap_u32, swap_u64, swap_generic,
select_swap_func): New functions.
(__qsort_r):
---
stdlib/qsort.c | 77 ++++++++++++++++++++++++++++++++++++++++++++--------------
1 file changed, 59 insertions(+), 18 deletions(-)
diff --git a/stdlib/qsort.c b/stdlib/qsort.c
index b3a5102..2194003 100644
--- a/stdlib/qsort.c
+++ b/stdlib/qsort.c
@@ -23,20 +23,59 @@
#include <limits.h>
#include <stdlib.h>
#include <string.h>
+#include <stdbool.h>
-/* Byte-wise swap two items of size SIZE. */
-#define SWAP(a, b, size) \
- do \
- { \
- size_t __size = (size); \
- char *__a = (a), *__b = (b); \
- do \
- { \
- char __tmp = *__a; \
- *__a++ = *__b; \
- *__b++ = __tmp; \
- } while (--__size > 0); \
- } while (0)
+/* Swap SIZE bytes between addresses A and B. Helper to generic types
+ are provided as an optimization. */
+
+typedef void (*swap_t)(void *, void *, size_t);
+
+static inline bool
+check_alignment (const void *base, size_t align)
+{
+ return _STRING_ARCH_unaligned || ((uintptr_t)base % (align - 1)) == 0;
+}
+
+static void
+swap_u32 (void *a, void *b, size_t size)
+{
+ uint32_t tmp = *(uint32_t*) a;
+ *(uint32_t*) a = *(uint32_t*) b;
+ *(uint32_t*) b = tmp;
+}
+
+static void
+swap_u64 (void *a, void *b, size_t size)
+{
+ uint64_t tmp = *(uint64_t*) a;
+ *(uint64_t*) a = *(uint64_t*) b;
+ *(uint64_t*) b = tmp;
+}
+
+static inline void
+swap_generic (void *a, void *b, size_t size)
+{
+ unsigned char tmp[128];
+ do
+ {
+ size_t s = size > sizeof (tmp) ? sizeof (tmp) : size;
+ memcpy (tmp, a, s);
+ a = __mempcpy (a, b, s);
+ b = __mempcpy (b, tmp, s);
+ size -= s;
+ }
+ while (size > 0);
+}
+
+static inline swap_t
+select_swap_func (const void *base, size_t size)
+{
+ if (size == 4 && check_alignment (base, 4))
+ return swap_u32;
+ else if (size == 8 && check_alignment (base, 8))
+ return swap_u64;
+ return swap_generic;
+}
/* Discontinue quicksort algorithm when partition gets below this size.
This particular magic number was chosen to work best on a Sun 4/260. */
@@ -96,6 +135,8 @@ __qsort_r (void *const pbase, size_t total_elems, size_t size,
/* Avoid lossage with unsigned arithmetic below. */
return;
+ swap_t swap = select_swap_func (pbase, size);
+
if (total_elems > MAX_THRESH)
{
char *lo = base_ptr;
@@ -119,13 +160,13 @@ __qsort_r (void *const pbase, size_t total_elems, size_t size,
char *mid = lo + size * ((hi - lo) / size >> 1);
if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
- SWAP (mid, lo, size);
+ swap (mid, lo, size);
if ((*cmp) ((void *) hi, (void *) mid, arg) < 0)
- SWAP (mid, hi, size);
+ swap (mid, hi, size);
else
goto jump_over;
if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
- SWAP (mid, lo, size);
+ swap (mid, lo, size);
jump_over:;
left_ptr = lo + size;
@@ -144,7 +185,7 @@ __qsort_r (void *const pbase, size_t total_elems, size_t size,
if (left_ptr < right_ptr)
{
- SWAP (left_ptr, right_ptr, size);
+ swap (left_ptr, right_ptr, size);
if (mid == left_ptr)
mid = right_ptr;
else if (mid == right_ptr)
@@ -216,7 +257,7 @@ __qsort_r (void *const pbase, size_t total_elems, size_t size,
tmp_ptr = run_ptr;
if (tmp_ptr != base_ptr)
- SWAP (tmp_ptr, base_ptr, size);
+ swap (tmp_ptr, base_ptr, size);
/* Insertion sort, running from left-hand-side up to right-hand-side. */
--
2.7.4