]>
Commit | Line | Data |
---|---|---|
6d7e8eda | 1 | /* Copyright (C) 1991-2023 Free Software Foundation, Inc. |
6d52618b | 2 | This file is part of the GNU C Library. |
28f540f4 | 3 | |
6d52618b | 4 | The GNU C Library is free software; you can redistribute it and/or |
41bdb6e2 AJ |
5 | modify it under the terms of the GNU Lesser General Public |
6 | License as published by the Free Software Foundation; either | |
7 | version 2.1 of the License, or (at your option) any later version. | |
28f540f4 | 8 | |
6d52618b UD |
9 | The GNU C Library is distributed in the hope that it will be useful, |
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
41bdb6e2 | 12 | Lesser General Public License for more details. |
28f540f4 | 13 | |
41bdb6e2 | 14 | You should have received a copy of the GNU Lesser General Public |
59ba27a6 | 15 | License along with the GNU C Library; if not, see |
5a82c748 | 16 | <https://www.gnu.org/licenses/>. */ |
28f540f4 | 17 | |
061d137b UD |
18 | /* If you consider tuning this algorithm, you should consult first: |
19 | Engineering a sort function; Jon Bentley and M. Douglas McIlroy; | |
20 | Software - Practice and Experience; Vol. 23 (11), 1249-1265, 1993. */ | |
21 | ||
061d137b | 22 | #include <limits.h> |
21d30c77 | 23 | #include <memswap.h> |
28f540f4 RM |
24 | #include <stdlib.h> |
25 | #include <string.h> | |
21d30c77 | 26 | #include <stdbool.h> |
28f540f4 | 27 | |
21d30c77 AZ |
/* Strategy used to exchange two array elements.  The word-based variants
   are provided alongside the generic byte-wise one as an optimization.  */

enum swap_type_t
  {
    /* Exchange the elements 64 bits at a time.  */
    SWAP_WORDS_64,
    /* Exchange the elements 32 bits at a time.  */
    SWAP_WORDS_32,
    /* Generic swap with no alignment or size requirement.  */
    SWAP_BYTES
  };
37 | ||
/* Return true if elements can safely be copied with word-sized loads and
   stores, false if that might not be safe.  For that, BASE (taken as an
   integer) has to be a multiple of the word alignment and SIZE a multiple
   of WORDSIZE.  WORDSIZE is a power of two and a multiple of the word
   alignment on all supported platforms, so for speed the function only
   verifies that neither BASE nor SIZE has any bit set below WORDSIZE.  */
static inline bool
is_aligned (const void *base, size_t size, size_t wordsize)
{
  const size_t low_bits = wordsize - 1;
  const bool base_ok = ((uintptr_t) base & low_bits) == 0;
  const bool size_ok = (size & low_bits) == 0;

  return base_ok && size_ok;
}
50 | ||
/* Exchange the N bytes at A with the N bytes at B, 64 bits at a time,
   working from the highest word down to the lowest.  N must be a non-zero
   multiple of 8: the loop body always runs at least once.  */
static inline void
swap_words_64 (void * restrict a, void * restrict b, size_t n)
{
  typedef uint64_t __attribute__ ((__may_alias__)) u64_alias_t;
  u64_alias_t *words_a = a;
  u64_alias_t *words_b = b;
  size_t idx = n / sizeof (u64_alias_t);

  do
    {
      --idx;
      const u64_alias_t saved = words_a[idx];
      words_a[idx] = words_b[idx];
      words_b[idx] = saved;
    }
  while (idx != 0);
}
63 | ||
/* Exchange the N bytes at A with the N bytes at B, 32 bits at a time,
   working from the highest word down to the lowest.  N must be a non-zero
   multiple of 4: the loop body always runs at least once.  */
static inline void
swap_words_32 (void * restrict a, void * restrict b, size_t n)
{
  typedef uint32_t __attribute__ ((__may_alias__)) u32_alias_t;
  u32_alias_t *words_a = a;
  u32_alias_t *words_b = b;
  size_t idx = n / sizeof (u32_alias_t);

  do
    {
      --idx;
      const u32_alias_t saved = words_a[idx];
      words_a[idx] = words_b[idx];
      words_b[idx] = saved;
    }
  while (idx != 0);
}
76 | ||
77 | /* Replace the indirect call with a serie of if statements. It should help | |
78 | the branch predictor. */ | |
79 | static void | |
80 | do_swap (void * restrict a, void * restrict b, size_t size, | |
81 | enum swap_type_t swap_type) | |
82 | { | |
83 | if (swap_type == SWAP_WORDS_64) | |
84 | swap_words_64 (a, b, size); | |
85 | else if (swap_type == SWAP_WORDS_32) | |
86 | swap_words_32 (a, b, size); | |
87 | else | |
88 | __memswap (a, b, size); | |
89 | } | |
28f540f4 RM |
90 | |
/* Quicksort stops working on a partition once it holds this many elements
   or fewer.  This particular magic number was chosen to work best on a
   Sun 4/260.  */
#define MAX_THRESH 4

/* One entry of the explicit stack that records a partition which still
   has to be sorted.  */
typedef struct
  {
    /* First element of the pending partition.  */
    char *lo;
    /* Last element of the pending partition.  */
    char *hi;
    /* Depth value stored together with the partition.  */
    size_t depth;
  } stack_node;

/* The stack needs log (total_elements) entries (we could even subtract
   log (MAX_THRESH)).  Since total_elements has type size_t, an upper bound
   for log (total_elements) is the number of bits in a size_t:
   bits per byte (CHAR_BIT) * sizeof (size_t).  */
enum { STACK_SIZE = sizeof (size_t) * CHAR_BIT };
108 | ||
109 | static inline stack_node * | |
274a46c9 | 110 | push (stack_node *top, char *lo, char *hi, size_t depth) |
d097f3c7 AZ |
111 | { |
112 | top->lo = lo; | |
113 | top->hi = hi; | |
274a46c9 | 114 | top->depth = depth; |
d097f3c7 AZ |
115 | return ++top; |
116 | } | |
117 | ||
118 | static inline stack_node * | |
274a46c9 | 119 | pop (stack_node *top, char **lo, char **hi, size_t *depth) |
d097f3c7 AZ |
120 | { |
121 | --top; | |
122 | *lo = top->lo; | |
123 | *hi = top->hi; | |
274a46c9 | 124 | *depth = top->depth; |
d097f3c7 AZ |
125 | return top; |
126 | } | |
28f540f4 | 127 | |
274a46c9 AZ |
128 | /* NB: N is inclusive bound for BASE. */ |
129 | static inline void | |
130 | siftdown (void *base, size_t size, size_t k, size_t n, | |
131 | enum swap_type_t swap_type, __compar_d_fn_t cmp, void *arg) | |
132 | { | |
133 | while (k <= n / 2) | |
134 | { | |
135 | size_t j = 2 * k; | |
136 | if (j < n && cmp (base + (j * size), base + ((j + 1) * size), arg) < 0) | |
137 | j++; | |
138 | ||
f8cfb683 | 139 | if (j == k || cmp (base + (k * size), base + (j * size), arg) >= 0) |
274a46c9 AZ |
140 | break; |
141 | ||
142 | do_swap (base + (size * j), base + (k * size), size, swap_type); | |
143 | k = j; | |
144 | } | |
145 | } | |
146 | ||
147 | static inline void | |
148 | heapify (void *base, size_t size, size_t n, enum swap_type_t swap_type, | |
149 | __compar_d_fn_t cmp, void *arg) | |
150 | { | |
151 | size_t k = n / 2; | |
152 | while (1) | |
153 | { | |
154 | siftdown (base, size, k, n, swap_type, cmp, arg); | |
155 | if (k-- == 0) | |
156 | break; | |
157 | } | |
158 | } | |
159 | ||
160 | /* A non-recursive heapsort, used on introsort implementation as a fallback | |
161 | routine with worst-case performance of O(nlog n) and worst-case space | |
162 | complexity of O(1). It sorts the array starting at BASE and ending at | |
163 | END, with each element of SIZE bytes. The SWAP_TYPE is the callback | |
164 | function used to swap elements, and CMP is the function used to compare | |
165 | elements. */ | |
166 | static void | |
167 | heapsort_r (void *base, void *end, size_t size, enum swap_type_t swap_type, | |
168 | __compar_d_fn_t cmp, void *arg) | |
169 | { | |
170 | const size_t count = ((uintptr_t) end - (uintptr_t) base) / size; | |
171 | ||
172 | if (count < 2) | |
173 | return; | |
174 | ||
175 | size_t n = count - 1; | |
176 | ||
177 | /* Build the binary heap, largest value at the base[0]. */ | |
178 | heapify (base, size, n, swap_type, cmp, arg); | |
179 | ||
180 | /* On each iteration base[0:n] is the binary heap, while base[n:count] | |
181 | is sorted. */ | |
182 | while (n > 0) | |
183 | { | |
184 | do_swap (base, base + (n * size), size, swap_type); | |
185 | n--; | |
186 | siftdown (base, size, 0, n, swap_type, cmp, arg); | |
187 | } | |
188 | } | |
28f540f4 | 189 | |
a035a985 AZ |
190 | static inline void |
191 | insertion_sort_qsort_partitions (void *const pbase, size_t total_elems, | |
192 | size_t size, enum swap_type_t swap_type, | |
193 | __compar_d_fn_t cmp, void *arg) | |
194 | { | |
195 | char *base_ptr = (char *) pbase; | |
196 | char *const end_ptr = &base_ptr[size * (total_elems - 1)]; | |
197 | char *tmp_ptr = base_ptr; | |
198 | #define min(x, y) ((x) < (y) ? (x) : (y)) | |
199 | const size_t max_thresh = MAX_THRESH * size; | |
200 | char *thresh = min(end_ptr, base_ptr + max_thresh); | |
201 | char *run_ptr; | |
202 | ||
203 | /* Find smallest element in first threshold and place it at the | |
204 | array's beginning. This is the smallest array element, | |
205 | and the operation speeds up insertion sort's inner loop. */ | |
206 | ||
207 | for (run_ptr = tmp_ptr + size; run_ptr <= thresh; run_ptr += size) | |
208 | if (cmp (run_ptr, tmp_ptr, arg) < 0) | |
209 | tmp_ptr = run_ptr; | |
210 | ||
211 | if (tmp_ptr != base_ptr) | |
212 | do_swap (tmp_ptr, base_ptr, size, swap_type); | |
213 | ||
214 | /* Insertion sort, running from left-hand-side up to right-hand-side. */ | |
215 | ||
216 | run_ptr = base_ptr + size; | |
217 | while ((run_ptr += size) <= end_ptr) | |
218 | { | |
219 | tmp_ptr = run_ptr - size; | |
e4d8117b | 220 | while (run_ptr != tmp_ptr && cmp (run_ptr, tmp_ptr, arg) < 0) |
a035a985 AZ |
221 | tmp_ptr -= size; |
222 | ||
223 | tmp_ptr += size; | |
224 | if (tmp_ptr != run_ptr) | |
225 | { | |
226 | char *trav; | |
227 | ||
228 | trav = run_ptr + size; | |
229 | while (--trav >= run_ptr) | |
230 | { | |
231 | char c = *trav; | |
232 | char *hi, *lo; | |
233 | ||
234 | for (hi = lo = trav; (lo -= size) >= tmp_ptr; hi = lo) | |
235 | *hi = *lo; | |
236 | *hi = c; | |
237 | } | |
238 | } | |
239 | } | |
240 | } | |
241 | ||
28f540f4 RM |
242 | /* Order size using quicksort. This implementation incorporates |
243 | four optimizations discussed in Sedgewick: | |
244 | ||
6d52618b UD |
245 | 1. Non-recursive, using an explicit stack of pointers that store the |
246 | next array partition to sort. To save time, this maximum amount | |
061d137b UD |
247 | of space required to store an array of SIZE_MAX is allocated on the |
248 | stack. Assuming a 32-bit (64 bit) integer for size_t, this needs | |
249 | only 32 * sizeof(stack_node) == 384 bytes (for 64 bit: 1536 bytes). |
250 | Pretty cheap, actually. | |
28f540f4 RM |
251 | |
252 | 2. Choose the pivot element using a median-of-three decision tree. |
6d52618b | 253 | This reduces the probability of selecting a bad pivot value and |
28f540f4 RM |
254 | eliminates certain extraneous comparisons. |
255 | ||
256 | 3. Only quicksorts TOTAL_ELEMS / MAX_THRESH partitions, leaving | |
6d52618b | 257 | insertion sort to order the MAX_THRESH items within each partition. |
28f540f4 | 258 | This is a big win, since insertion sort is faster for small, mostly |
6d52618b | 259 | sorted array segments. |
28f540f4 RM |
260 | |
261 | 4. The larger of the two sub-partitions is always pushed onto the | |
262 | stack first, with the algorithm then concentrating on the | |
061d137b | 263 | smaller partition. This *guarantees* no more than log (total_elems) |
28f540f4 RM |
264 | stack size is needed (actually O(1) in this case)! */ |
265 | ||
266 | void | |
03bf8357 AZ |
267 | __qsort_r (void *const pbase, size_t total_elems, size_t size, |
268 | __compar_d_fn_t cmp, void *arg) | |
28f540f4 | 269 | { |
2e09a79a | 270 | char *base_ptr = (char *) pbase; |
28f540f4 | 271 | |
7cc27f44 | 272 | const size_t max_thresh = MAX_THRESH * size; |
28f540f4 | 273 | |
274a46c9 | 274 | if (total_elems <= 1) |
28f540f4 RM |
275 | /* Avoid lossage with unsigned arithmetic below. */ |
276 | return; | |
277 | ||
21d30c77 AZ |
278 | enum swap_type_t swap_type; |
279 | if (is_aligned (pbase, size, 8)) | |
280 | swap_type = SWAP_WORDS_64; | |
281 | else if (is_aligned (pbase, size, 4)) | |
282 | swap_type = SWAP_WORDS_32; | |
283 | else | |
284 | swap_type = SWAP_BYTES; | |
285 | ||
274a46c9 AZ |
286 | /* Maximum depth before quicksort switches to heapsort. */ |
287 | size_t depth = 2 * (sizeof (size_t) * CHAR_BIT - 1 | |
288 | - __builtin_clzl (total_elems)); | |
289 | ||
28f540f4 RM |
290 | if (total_elems > MAX_THRESH) |
291 | { | |
292 | char *lo = base_ptr; | |
293 | char *hi = &lo[size * (total_elems - 1)]; | |
28f540f4 | 294 | stack_node stack[STACK_SIZE]; |
274a46c9 | 295 | stack_node *top = push (stack, NULL, NULL, depth); |
28f540f4 | 296 | |
d097f3c7 | 297 | while (stack < top) |
28f540f4 | 298 | { |
274a46c9 AZ |
299 | if (depth == 0) |
300 | { | |
301 | heapsort_r (lo, hi, size, swap_type, cmp, arg); | |
302 | top = pop (top, &lo, &hi, &depth); | |
303 | continue; | |
304 | } | |
305 | ||
28f540f4 RM |
306 | char *left_ptr; |
307 | char *right_ptr; | |
308 | ||
28f540f4 | 309 | /* Select median value from among LO, MID, and HI. Rearrange |
6d52618b UD |
310 | LO and HI so the three values are sorted. This lowers the |
311 | probability of picking a pathological pivot value and | |
061d137b UD |
312 | skips a comparison for both the LEFT_PTR and RIGHT_PTR in |
313 | the while loops. */ | |
28f540f4 RM |
314 | |
315 | char *mid = lo + size * ((hi - lo) / size >> 1); | |
316 | ||
e458144c | 317 | if ((*cmp) ((void *) mid, (void *) lo, arg) < 0) |
21d30c77 | 318 | do_swap (mid, lo, size, swap_type); |
e458144c | 319 | if ((*cmp) ((void *) hi, (void *) mid, arg) < 0) |
21d30c77 | 320 | do_swap (mid, hi, size, swap_type); |
6d52618b | 321 | else |
28f540f4 | 322 | goto jump_over; |
e458144c | 323 | if ((*cmp) ((void *) mid, (void *) lo, arg) < 0) |
21d30c77 | 324 | do_swap (mid, lo, size, swap_type); |
28f540f4 | 325 | jump_over:; |
28f540f4 RM |
326 | |
327 | left_ptr = lo + size; | |
6d52618b | 328 | right_ptr = hi - size; |
28f540f4 | 329 | |
6d52618b UD |
330 | /* Here's the famous ``collapse the walls'' section of quicksort. |
331 | Gotta like those tight inner loops! They are the main reason | |
28f540f4 | 332 | that this algorithm runs much faster than others. */ |
6d52618b | 333 | do |
28f540f4 | 334 | { |
f8cfb683 FW |
335 | while (left_ptr != mid |
336 | && (*cmp) ((void *) left_ptr, (void *) mid, arg) < 0) | |
28f540f4 RM |
337 | left_ptr += size; |
338 | ||
f8cfb683 FW |
339 | while (right_ptr != mid |
340 | && (*cmp) ((void *) mid, (void *) right_ptr, arg) < 0) | |
28f540f4 RM |
341 | right_ptr -= size; |
342 | ||
6d52618b | 343 | if (left_ptr < right_ptr) |
28f540f4 | 344 | { |
21d30c77 | 345 | do_swap (left_ptr, right_ptr, size, swap_type); |
fa8d436c UD |
346 | if (mid == left_ptr) |
347 | mid = right_ptr; | |
348 | else if (mid == right_ptr) | |
349 | mid = left_ptr; | |
28f540f4 RM |
350 | left_ptr += size; |
351 | right_ptr -= size; | |
352 | } | |
6d52618b | 353 | else if (left_ptr == right_ptr) |
28f540f4 RM |
354 | { |
355 | left_ptr += size; | |
356 | right_ptr -= size; | |
357 | break; | |
358 | } | |
6d52618b | 359 | } |
28f540f4 RM |
360 | while (left_ptr <= right_ptr); |
361 | ||
362 | /* Set up pointers for next iteration. First determine whether | |
6d52618b | 363 | left and right partitions are below the threshold size. If so, |
28f540f4 RM |
364 | ignore one or both. Otherwise, push the larger partition's |
365 | bounds on the stack and continue sorting the smaller one. */ | |
366 | ||
367 | if ((size_t) (right_ptr - lo) <= max_thresh) | |
368 | { | |
369 | if ((size_t) (hi - left_ptr) <= max_thresh) | |
370 | /* Ignore both small partitions. */ | |
274a46c9 | 371 | top = pop (top, &lo, &hi, &depth); |
28f540f4 | 372 | else |
6d52618b | 373 | /* Ignore small left partition. */ |
28f540f4 RM |
374 | lo = left_ptr; |
375 | } | |
376 | else if ((size_t) (hi - left_ptr) <= max_thresh) | |
377 | /* Ignore small right partition. */ | |
378 | hi = right_ptr; | |
379 | else if ((right_ptr - lo) > (hi - left_ptr)) | |
6d52618b | 380 | { |
28f540f4 | 381 | /* Push larger left partition indices. */ |
274a46c9 | 382 | top = push (top, lo, right_ptr, depth - 1); |
28f540f4 RM |
383 | lo = left_ptr; |
384 | } | |
385 | else | |
6d52618b | 386 | { |
28f540f4 | 387 | /* Push larger right partition indices. */ |
274a46c9 | 388 | top = push (top, left_ptr, hi, depth - 1); |
28f540f4 RM |
389 | hi = right_ptr; |
390 | } | |
391 | } | |
392 | } | |
393 | ||
394 | /* Once the BASE_PTR array is partially sorted by quicksort the rest | |
6d52618b UD |
395 | is completely sorted using insertion sort, since this is efficient |
396 | for partitions below MAX_THRESH size. BASE_PTR points to the beginning | |
28f540f4 RM |
397 | of the array to sort, and END_PTR points at the very last element in |
398 | the array (*not* one beyond it!). */ | |
a035a985 AZ |
399 | insertion_sort_qsort_partitions (pbase, total_elems, size, swap_type, cmp, |
400 | arg); | |
28f540f4 | 401 | } |
03bf8357 AZ |
402 | libc_hidden_def (__qsort_r) |
403 | weak_alias (__qsort_r, qsort_r) | |
404 | ||
405 | void | |
406 | qsort (void *b, size_t n, size_t s, __compar_fn_t cmp) | |
407 | { | |
408 | return __qsort_r (b, n, s, (__compar_d_fn_t) cmp, NULL); | |
409 | } | |
410 | libc_hidden_def (qsort) |