[PATCH v2 6/7] stdlib: Optimization qsort{_r} swap implementation
- From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
- To: libc-alpha@sourceware.org
- Date: Fri, 31 Aug 2018 17:42:37 -0300
- Subject: [PATCH v2 6/7] stdlib: Optimization qsort{_r} swap implementation
- References: <20180831204238.10626-1-adhemerval.zanella@linaro.org>
This patch adds an optimized swap operation to qsort, based on the
previous msort one.  Instead of a byte-wise swap, three variants are
provided:
1. Using uint32_t loads and stores.
2. Using uint64_t loads and stores.
3. A generic one using a temporary buffer and memcpy.
Variants 1 and 2 are selected only if the architecture defines
_STRING_ARCH_unaligned or if the base pointer is aligned to the
required type.  This choice is based on data from bench-qsort:
programs usually call qsort with an element size that is a multiple
of the machine word.
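As an illustration (not part of the patch), a hypothetical caller like
the following would take the swap_u64 path, since the element size
equals sizeof (uint64_t) and the array is naturally aligned:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Order two int64_t values for qsort.  */
static int
cmp_i64 (const void *a, const void *b)
{
  int64_t va = *(const int64_t *) a;
  int64_t vb = *(const int64_t *) b;
  return (va > vb) - (va < vb);
}

int
main (void)
{
  /* Element size is sizeof (uint64_t) and the array is suitably
     aligned, so select_swap_func would return swap_u64 here; an
     element size of, say, 24 would fall back to swap_generic.  */
  int64_t v[] = { 5, -1, 3, 2 };
  qsort (v, sizeof v / sizeof v[0], sizeof v[0], cmp_i64);
  for (size_t i = 0; i < sizeof v / sizeof v[0]; i++)
    printf ("%lld\n", (long long) v[i]);
  return 0;
}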
Benchmarking shows a performance improvement.
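In the tables below, base and patched are the bench-qsort figures for
the current and the patched implementation, and diff is the relative
change in percent:

  diff = 100 * (patched - base) / base

so negative values mean the patched version is faster.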
Results for member size 4
  MostlySorted
   nmemb |      base |   patched | diff (%)
      32 |      2145 |      2012 |    -6.20
    4096 |    902724 |    789329 |   -12.56
   32768 |   7844403 |   7871853 |     0.35
  524288 | 152809379 | 151037732 |    -1.16

  Repeated
   nmemb |      base |   patched | diff (%)
      32 |      2222 |      2017 |    -9.23
    4096 |   1029244 |    948784 |    -7.82
   32768 |  10057897 |   9242215 |    -8.11
  524288 | 197150076 | 182508235 |    -7.43

  Sorted
   nmemb |      base |   patched | diff (%)
      32 |      1461 |      1332 |    -8.83
    4096 |    357884 |    351388 |    -1.82
   32768 |   3468080 |   3556735 |     2.56
  524288 |  67584959 |  67278267 |    -0.45

  Unsorted
   nmemb |      base |   patched | diff (%)
      32 |      2385 |      2009 |   -15.77
    4096 |   1146892 |   1039794 |    -9.34
   32768 |  10799397 |  10116859 |    -6.32
  524288 | 212098446 | 198713626 |    -6.31
Results for member size 8
  MostlySorted
   nmemb |      base |   patched | diff (%)
      32 |      2676 |      1965 |   -26.57
    4096 |    907769 |    762125 |   -16.04
   32768 |   8605499 |   7017240 |   -18.46
  524288 | 154255341 | 129564606 |   -16.01

  Repeated
   nmemb |      base |   patched | diff (%)
      32 |      2230 |      1998 |   -10.40
    4096 |   1129157 |    970507 |   -14.05
   32768 |  10775229 |   9173248 |   -14.87
  524288 | 212935649 | 179637006 |   -15.64

  Sorted
   nmemb |      base |   patched | diff (%)
      32 |      1193 |      1205 |     1.01
    4096 |    308152 |    308174 |     0.01
   32768 |   3022480 |   3018157 |    -0.14
  524288 |  60029109 |  59608087 |    -0.70

  Unsorted
   nmemb |      base |   patched | diff (%)
      32 |      2814 |      2575 |    -8.49
    4096 |   1198160 |   1040231 |   -13.18
   32768 |  11678920 |  10160881 |   -13.00
  524288 | 229161344 | 197305501 |   -13.90
Results for member size 32
  MostlySorted
   nmemb |      base |   patched | diff (%)
      32 |      5073 |      4474 |   -11.81
    4096 |   1572437 |   1185956 |   -24.58
   32768 |  14170732 |  10710788 |   -24.42
  524288 | 267001863 | 196553161 |   -26.39

  Repeated
   nmemb |      base |   patched | diff (%)
      32 |      5592 |      4670 |   -16.49
    4096 |   1890849 |   1351979 |   -28.50
   32768 |  18284219 |  12917614 |   -29.35
  524288 | 361847282 | 258020738 |   -28.69

  Sorted
   nmemb |      base |   patched | diff (%)
      32 |      1179 |      1221 |     3.56
    4096 |    308793 |    308146 |    -0.21
   32768 |   3017486 |   3120670 |     3.42
  524288 |  63986145 |  64995508 |     1.58

  Unsorted
   nmemb |      base |   patched | diff (%)
      32 |      6591 |      3975 |   -39.69
    4096 |   1990681 |   1470465 |   -26.13
   32768 |  19127569 |  14109425 |   -26.24
  524288 | 375072780 | 276687595 |   -26.23
Checked on x86_64-linux-gnu.
[BZ #19305].
* stdlib/qsort.c (SWAP): Remove.
(check_alignment, swap_u32, swap_u64, swap_generic,
select_swap_func): New functions.
(__qsort_r): Use them.
---
stdlib/qsort.c | 84 +++++++++++++++++++++++++++++++++++++++-----------
1 file changed, 66 insertions(+), 18 deletions(-)
diff --git a/stdlib/qsort.c b/stdlib/qsort.c
index b3a5102cac..c3fb0e862f 100644
--- a/stdlib/qsort.c
+++ b/stdlib/qsort.c
@@ -23,20 +23,66 @@
 #include <limits.h>
 #include <stdlib.h>
 #include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
 
-/* Byte-wise swap two items of size SIZE.  */
-#define SWAP(a, b, size)                                              \
-  do                                                                  \
-    {                                                                 \
-      size_t __size = (size);                                         \
-      char *__a = (a), *__b = (b);                                    \
-      do                                                              \
-        {                                                             \
-          char __tmp = *__a;                                          \
-          *__a++ = *__b;                                              \
-          *__b++ = __tmp;                                             \
-        } while (--__size > 0);                                       \
-    } while (0)
+/* Swap SIZE bytes between addresses A and B.  Helpers for common
+   element sizes are provided as an optimization.  */
+
+typedef void (*swap_t) (void *, void *, size_t);
+
+static inline bool
+check_alignment (const void *base, size_t align)
+{
+  return _STRING_ARCH_unaligned || ((uintptr_t) base & (align - 1)) == 0;
+}
+
+static void
+swap_u32 (void * restrict a, void * restrict b, size_t size)
+{
+  uint32_t *ua = a, *ub = b, tmp = *ua;
+  *ua = *ub, *ub = tmp;
+}
+
+static void
+swap_u64 (void * restrict a, void * restrict b, size_t size)
+{
+  uint64_t *ua = a, *ub = b, tmp = *ua;
+  *ua = *ub, *ub = tmp;
+}
+
+static void
+swap_generic (void * restrict a, void * restrict b, size_t size)
+{
+  /* Use multiple small memcpys with constant size to enable inlining
+     on most targets.  */
+  enum {
+    SWAP_GENERIC_SIZE = 32
+  };
+  unsigned char tmp[SWAP_GENERIC_SIZE];
+  while (size > SWAP_GENERIC_SIZE)
+    {
+      memcpy (tmp, a, SWAP_GENERIC_SIZE);
+      a = memcpy (a, b, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
+      b = memcpy (b, tmp, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
+      size -= SWAP_GENERIC_SIZE;
+    }
+  memcpy (tmp, a, size);
+  memcpy (a, b, size);
+  memcpy (b, tmp, size);
+}
+
+static inline swap_t
+select_swap_func (const void *base, size_t size)
+{
+  if (size == sizeof (uint32_t)
+      && check_alignment (base, _Alignof (uint32_t)))
+    return swap_u32;
+  else if (size == sizeof (uint64_t)
+           && check_alignment (base, _Alignof (uint64_t)))
+    return swap_u64;
+  return swap_generic;
+}
 
 /* Discontinue quicksort algorithm when partition gets below this size.
    This particular magic number was chosen to work best on a Sun 4/260.  */
@@ -96,6 +142,8 @@ __qsort_r (void *const pbase, size_t total_elems, size_t size,
     /* Avoid lossage with unsigned arithmetic below.  */
     return;
 
+  swap_t swap = select_swap_func (pbase, size);
+
   if (total_elems > MAX_THRESH)
     {
       char *lo = base_ptr;
@@ -119,13 +167,13 @@ __qsort_r (void *const pbase, size_t total_elems, size_t size,
           char *mid = lo + size * ((hi - lo) / size >> 1);
 
           if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
-            SWAP (mid, lo, size);
+            swap (mid, lo, size);
           if ((*cmp) ((void *) hi, (void *) mid, arg) < 0)
-            SWAP (mid, hi, size);
+            swap (mid, hi, size);
           else
             goto jump_over;
           if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
-            SWAP (mid, lo, size);
+            swap (mid, lo, size);
         jump_over:;
 
           left_ptr  = lo + size;
@@ -144,7 +192,7 @@ __qsort_r (void *const pbase, size_t total_elems, size_t size,
 
               if (left_ptr < right_ptr)
                 {
-                  SWAP (left_ptr, right_ptr, size);
+                  swap (left_ptr, right_ptr, size);
                   if (mid == left_ptr)
                     mid = right_ptr;
                   else if (mid == right_ptr)
@@ -216,7 +264,7 @@ __qsort_r (void *const pbase, size_t total_elems, size_t size,
           tmp_ptr = run_ptr;
 
     if (tmp_ptr != base_ptr)
-      SWAP (tmp_ptr, base_ptr, size);
+      swap (tmp_ptr, base_ptr, size);
 
     /* Insertion sort, running from left-hand-side up to right-hand-side.  */
 
--
2.17.1