On modern CPUs, avoiding branch misprediction is an effective way to speed up programs.
When ‘if’ slows you down, avoid it
Performance results naturally depend on the underlying hardware. The following benchmarks show the execution times for sorting 50 million integers using different sorting implementations. The measurements were taken on an Apple M1 system and on an Intel Xeon (E5-2680) system with the -O3 compiler option.
| Implementation | Apple M1 | Intel Xeon |
|---|---|---|
| Baseline Quicksort | 3.234s | 6.792s |
| Branch-Avoidant | 1.410s | 3.228s |
| std::sort (C++) | 1.190s | 4.949s |
| Branch-Avoidant Optimized | 0.870s | 1.724s |
This is a simple iterative quicksort written in C using a Lomuto‑style partition.
The pivot is placed directly into its final position during partitioning, and the algorithm always continues with the larger part while storing the smaller one on a manual stack.
To avoid the O(n²) runtime caused by many duplicates, the program switches to heap sort for a segment if it detects a significant bias in the partitioning.
// SPDX-License-Identifier: MIT
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>
#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int
#define swap(a, b) do { __typeof__(a) _h = a; a = b; b = _h; } while (0)
#define NAME "Baseline Quicksort"
TYPE* partition(TYPE* left, TYPE* right);
void heap_sort(TYPE* left, TYPE* right);
void sort(TYPE* data, int len) {
TYPE* left = data;
TYPE* stack[64]; // for max 2^32 items
unsigned sp = 0;
TYPE* right = data + len - 1;
while (1) {
long partsz = right - left;
if (partsz <= 0) goto pop_stack;
TYPE* mid = partition(left, right);
if ((mid - left) * 16 < partsz) {
heap_sort(left, right);
goto pop_stack;
}
if (mid < left + partsz / 2) {
stack[sp] = mid + 1;
stack[sp + 1] = right;
right = mid - 1;
}
else {
stack[sp] = left;
stack[sp + 1] = mid - 1;
left = mid + 1;
}
sp += 2;
continue;
pop_stack:
if (sp == 0) break;
sp -= 2;
left = stack[sp];
right = stack[sp + 1];
}
}
// ---------------------- partition ----------------------------
TYPE* partition(TYPE* left, TYPE* right)
{
TYPE* outerleft = left;
TYPE* pivp = left + (right - left) / 2;
TYPE piv = *pivp;
*pivp = *outerleft;
TYPE* mid = left;
left += 1;
while (left <= right) {
if (IS_LOWER(*left, piv)) {
mid++;
swap(*mid, *left);
}
left++;
}
*outerleft = *mid;
*mid = piv;
return mid;
}
// ---------------------- heap_sort, testing and main -----------------------------------
void heap_sort(TYPE* left, TYPE* right) {
long n = right - left + 1;
if (n < 2) return;
for (long i = n / 2, j, k; ; ) {
if (i > 0) {
k = left[--i];
}
else {
n -= 1;
if (n == 0) return;
k = left[n];
left[n] = left[0];
}
j = i;
while (j * 2 + 1 < n) {
long child = j * 2 + 1;
if (child + 1 < n && IS_LOWER(left[child], left[child + 1])) child++;
if (!IS_LOWER(k, left[child])) break;
left[j] = left[child];
j = child;
}
left[j] = k;
}
}
// ---------------------------------------------------------
unsigned chksum;
unsigned hash_el(void *p, int sz) {
unsigned char* c = (unsigned char*)p;
unsigned h = 0;
for (int i = 0; i < sz; i++) h = (h * 131) + c[i];
return h;
}
void init(TYPE* data, int len) {
chksum = 0;
for (int i = 0; i < len; i++) {
data[i] = rand();
chksum += hash_el(&data[i], sizeof(TYPE));
}
}
void test(TYPE* data, int len) {
unsigned chks = hash_el(&data[0], sizeof(TYPE));;
for (int i = 1; i < len; i++) {
if (data[i] < data[i - 1]) {
printf("ERROR ORDER\n");
break;
}
chks += hash_el(&data[i], sizeof(TYPE));
}
if (chks != chksum) printf("ERROR CHKS\n");
}
double t(void) {
static double t0;
struct timeval tv;
gettimeofday(&tv, NULL);
double h = t0;
t0 = tv.tv_sec + tv.tv_usec / 1000000.0;
return t0 - h;
}
// ---------------------------------------------------------
#define SIZE (50 * 1000000)
TYPE data[SIZE];
int main(void) {
init(data, SIZE);
printf("Sorting %d million numbers with " NAME " ...\n", SIZE / 1000000);
t();
sort(data, SIZE);
printf("%.3fs\n", t());
test(data, SIZE);
return 0;
}
Sorting 50 million numbers with Baseline Quicksort ...
3.234s
For comparison C++ std::sort:
#include <algorithm>
#include <sys/time.h>
#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int
#define NAME "C++ std::sort"
void sort(TYPE* data, int len) {
std::sort(data, data + len);
}
// ---------------------- testing and main -----------------------------------
.
.
Sorting 50 million numbers with std::sort ...
1.190s
The paper https://arxiv.org/abs/1604.06697 by Edelkamp and A. Weiß shows how partitioning performance in Quicksort can be improved by avoiding conditional branches.
The idea of using auxiliary memory for branch-avoidant partitioning is inspired by https://github.com/scandum/fluxsort .
A small sorting network was used and built on a branchless structure.
// SPDX-License-Identifier: MIT
// (c) 2026 christof.kaser@gmail.com
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>
#include <stddef.h>
#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int
#define NAME "Branch-Avoidant Quicksort"
TYPE* partition(TYPE* left, TYPE* right);
void sorting_network(TYPE* left, int size);
void heap_sort(TYPE* left, TYPE* right);
TYPE* partition(TYPE* left, TYPE* right);
void sort(TYPE* data, int len) {
TYPE* left = data;
TYPE* stack[80]; // for max 2^40 items
unsigned sp = 0;
TYPE* right = data + len - 1;
while (1) {
ptrdiff_t partsz = right - left;
if (partsz <= 3) {
sorting_network(left, partsz);
goto pop_stack;
}
TYPE* mid = partition(left, right);
if ((mid - left) * 16 < partsz) {
heap_sort(left, right);
goto pop_stack;
}
if (mid < left + partsz / 2) {
stack[sp] = mid + 1;
stack[sp + 1] = right;
right = mid - 1;
}
else {
stack[sp] = left;
stack[sp + 1] = mid - 1;
left = mid + 1;
}
sp += 2;
continue;
pop_stack:
if (sp == 0) break;
sp -= 2;
left = stack[sp];
right = stack[sp + 1];
}
}
// ---------------------- sorting network ----------------------------
#define sort2(a, b) do { \
__typeof__(a) x = a; \
__typeof__(a) y = b; \
unsigned m = IS_LOWER(x, y); \
a = m ? x : y; \
b = m ? y : x; \
} while(0)
#define sort3(a, b, c) do { \
sort2(a, b); sort2(b, c); sort2(a, b); \
} while(0)
#define sort4(a, b, c, d) do { \
sort2(a, b); sort2(c, d); sort2(a, c); \
sort2(b, d); sort2(b, c); \
} while(0)
void sorting_network(TYPE* l, int partsz_min_1) {
// partsz_min_1 == right - left
switch (partsz_min_1) {
case 3:
sort4(l[0], l[1], l[2], l[3]);
break;
case 2:
sort3(l[0], l[1], l[2]);
break;
case 1:
sort2(l[0], l[1]);
break;
case 0:
break;
}
}
// ---------------------- partition ----------------------------
TYPE* partition(TYPE* restrict left, TYPE* restrict right) {
TYPE* outerleft = left;
TYPE* pivp = left + (right - left) / 2;
sort3(left[1], *pivp, *right);
left += 2;
right -= 1;
TYPE piv = *pivp;
*pivp = *outerleft;
#define SWSZ 1024
TYPE _Alignas(64) swbuf[SWSZ];
TYPE* left0 = left;
TYPE* right0 = right;
TYPE* sw = swbuf;
#define UNROLL 16
while (sw < swbuf + SWSZ - UNROLL && left <= right) {
unsigned h = IS_LOWER(*right, piv);
*right0 = *sw = *right--;
right0 -= !h;
sw += h;
}
while (1) {
while (right0 > right && left <= right) {
unsigned h = IS_LOWER(*left, piv);
*left0 = *right0 = *left++;
left0 += h;
right0 -= !h;
}
while (left0 < left && left <= right) {
unsigned h = IS_LOWER(*right, piv);
*right0 = *left0 = *right--;
right0 -= !h;
left0 += h;
}
if (left > right) break;
}
memcpy(left0, swbuf, (sw - swbuf) * sizeof(TYPE));
*outerleft = *right0;
*right0 = piv;
return right0;
}
// ---------------------- heapsort, testing and main -----------------------------------
.
.
.
Sorting 50 million numbers with Branch-Avoidant Quicksort ...
1.396s
This variant handles small partitions of up to 24 elements using sorting networks, so these ranges are sorted directly instead of being partitioned further.
For larger parts of the array, the algorithm continues with branch‑avoidant partitioning and improved pivot selection.
// SPDX-License-Identifier: MIT
// (c) 2026 christof.kaser@gmail.com
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>
#include <stddef.h>
#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int
#define NAME "Optimized Branch-Avoidant Quicksort"
TYPE* partition(TYPE* left, TYPE* right);
void sorting_network(TYPE* left, int size);
void heap_sort(TYPE* left, TYPE* right);
TYPE* partition_small(TYPE* left, TYPE* right);
#define SMALLPART 1024
void sort(TYPE* data, int len) {
TYPE* left = data;
TYPE* stack[80]; // for max 2^40 items
unsigned sp = 0;
TYPE* right = data + len - 1;
while (1) {
ptrdiff_t partsz = right - left;
if (partsz <= 23) {
sorting_network(left, partsz);
goto pop_stack;
}
TYPE* mid;
if (partsz <= SMALLPART) mid = partition_small(left, right);
else mid = partition(left, right);
if (partsz > SMALLPART && (mid - left) * 16 < partsz) {
heap_sort(left, right);
goto pop_stack;
}
if (mid < left + partsz / 2) {
stack[sp] = mid + 1;
stack[sp + 1] = right;
right = mid - 1;
}
else {
stack[sp] = left;
stack[sp + 1] = mid - 1;
left = mid + 1;
}
sp += 2;
continue;
pop_stack:
if (sp == 0) break;
sp -= 2;
left = stack[sp];
right = stack[sp + 1];
}
}
// ---------------------- sorting network ----------------------------
#define sort2(a, b) do { \
__typeof__(a) x = a; \
__typeof__(a) y = b; \
unsigned m = IS_LOWER(x, y); \
a = m ? x : y; \
b = m ? y : x; \
} while(0)
// https://bertdobbelaere.github.io/sorting_networks.html
#define sort3(a, b, c) do { \
sort2(a, b); sort2(b, c); sort2(a, b); \
} while(0)
#define sort4(a, b, c, d) do { \
sort2(a, b); sort2(c, d); sort2(a, c); \
sort2(b, d); sort2(b, c); \
} while(0)
#define sort5(a, b, c, d, e) do { \
sort2(b, c); sort2(d, e); sort2(b, d); \
sort2(a, c); sort2(a, d); sort2(c, e); \
sort2(a, b); sort2(c, d); sort2(b, c); \
} while(0)
#define sort6(a, b, c, d, e, f) do { \
sort2(a, b); sort2(c, d); sort2(e, f); \
sort2(a, c); sort2(b, d); sort2(e, f); \
sort2(a, e); sort2(b, f); sort2(c, e); \
sort2(d, f); sort2(b, c); sort2(d, e); \
sort2(c, d); \
} while(0)
#define sort7(a, b, c, d, e, f, g) do { \
sort2(a, b); sort2(c, d); sort2(a, c); \
sort2(b, d); sort2(b, c); sort2(e, f); \
sort2(e, g); sort2(f, g); sort2(a, e); \
sort2(b, f); sort2(c, g); sort2(b, e); \
sort2(d, g); sort2(c, e); sort2(b, c); \
sort2(d, f); sort2(d, e); \
} while(0)
#define sort8(a,b,c,d,e,f,g,h) do { \
sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); \
sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); \
sort2(b,c); sort2(f,g); \
sort2(a,e); sort2(b,f); sort2(c,g); sort2(d,h); \
sort2(c,e); sort2(d,f); \
sort2(b,c); sort2(d,e); sort2(f,g); \
} while (0)
#define sort9(a,b,c,d,e,f,g,h,i) do { \
sort2(a,d); sort2(b,h); sort2(c,f); sort2(e,i); \
sort2(a,h); sort2(c,e); sort2(d,i); sort2(f,g); \
sort2(a,c); sort2(b,d); sort2(e,f); sort2(h,i); \
sort2(b,e); sort2(d,g); sort2(f,h); \
sort2(a,b); sort2(c,e); sort2(d,f); sort2(g,i); \
sort2(c,d); sort2(e,f); sort2(g,h); \
sort2(b,c); sort2(d,e); sort2(f,g); \
} while (0)
#define sort10(a,b,c,d,e,f,g,h,i,j) do { \
sort2(a,i); sort2(b,j); sort2(c,h); sort2(d,f); sort2(e,g); \
sort2(a,c); sort2(b,e); sort2(f,i); sort2(h,j); \
sort2(a,d); sort2(c,e); sort2(f,h); sort2(g,j); \
sort2(a,b); sort2(d,g); sort2(i,j); \
sort2(b,f); sort2(c,d); sort2(e,i); sort2(g,h); \
sort2(b,c); sort2(d,f); sort2(e,g); sort2(h,i); \
sort2(c,d); sort2(e,f); sort2(g,h); \
sort2(d,e); sort2(f,g); \
} while (0)
#define sort11(a,b,c,d,e,f,g,h,i,j,k) do { \
sort2(a,j); sort2(b,g); sort2(c,e); sort2(d,h); sort2(f,i); \
sort2(a,b); sort2(d,f); sort2(e,k); sort2(g,j); sort2(h,i); \
sort2(b,d); sort2(c,f); sort2(e,h); sort2(i,k); \
sort2(a,e); sort2(b,c); sort2(d,h); sort2(f,j); sort2(g,i); \
sort2(a,b); sort2(c,g); sort2(e,f); sort2(h,i); sort2(j,k); \
sort2(c,e); sort2(d,g); sort2(f,h); sort2(i,j); \
sort2(b,c); sort2(d,e); sort2(f,g); sort2(h,i); \
sort2(c,d); sort2(e,f); sort2(g,h); \
} while (0)
#define sort12(a,b,c,d,e,f,g,h,i,j,k,l) do { \
sort2(a,i); sort2(b,h); sort2(c,g); sort2(d,l); sort2(e,k); sort2(f,j); \
sort2(a,c); sort2(b,e); sort2(d,f); sort2(g,i); sort2(h,k); sort2(j,l); \
sort2(a,b); sort2(c,j); sort2(e,h); sort2(f,g); sort2(k,l); \
sort2(b,d); sort2(c,h); sort2(e,j); sort2(i,k); \
sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
sort2(b,c); sort2(d,f); sort2(g,i); sort2(j,k); \
sort2(c,e); sort2(d,g); sort2(f,i); sort2(h,j); \
sort2(b,c); sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
} while (0)
#define sort13(a,b,c,d,e,f,g,h,i,j,k,l,m) do { \
sort2(a,l); sort2(b,h); sort2(c,e); sort2(d,f); sort2(i,j); sort2(k,m); \
sort2(a,c); sort2(d,g); sort2(e,m); sort2(f,h); sort2(i,k); \
sort2(a,i); sort2(b,d); sort2(c,f); sort2(e,j); sort2(g,l); sort2(h,m); \
sort2(a,b); sort2(c,k); sort2(d,i); sort2(e,g); sort2(j,l); \
sort2(b,d); sort2(c,e); sort2(f,k); sort2(g,i); sort2(h,j); sort2(l,m); \
sort2(b,c); sort2(d,e); sort2(f,i); sort2(g,j); sort2(h,k); \
sort2(c,d); sort2(e,h); sort2(f,g); sort2(i,l); sort2(j,k); \
sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
} while (0)
#define sort14(a,b,c,d,e,f,g,h,i,j,k,l,m,n) do { \
sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); \
sort2(i,j); sort2(k,l); sort2(m,n); \
sort2(a,c); sort2(b,d); sort2(e,i); sort2(f,j); \
sort2(k,m); sort2(l,n); \
sort2(a,k); sort2(b,g); sort2(c,l); sort2(d,n); \
sort2(f,i); sort2(h,m); \
sort2(b,e); sort2(c,i); sort2(d,g); \
sort2(f,l); sort2(h,k); sort2(j,m); \
sort2(a,b); sort2(d,j); sort2(e,k); \
sort2(f,h); sort2(g,i); sort2(m,n); \
sort2(b,f); sort2(c,e); sort2(d,h); \
sort2(g,k); sort2(i,m); sort2(j,l); \
sort2(b,c); sort2(d,f); sort2(e,g); \
sort2(h,j); sort2(i,k); sort2(l,m); \
sort2(c,d); sort2(e,f); sort2(g,h); \
sort2(i,j); sort2(k,l); \
sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
} while (0)
#define sort15(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o) do { \
sort2(a,g); sort2(b,k); sort2(c,o); sort2(d,j); \
sort2(e,m); sort2(f,n); sort2(h,l); \
sort2(a,h); sort2(c,f); sort2(d,e); \
sort2(g,l); sort2(i,k); sort2(j,m); sort2(n,o); \
sort2(b,n); sort2(c,d); sort2(e,g); \
sort2(f,j); sort2(h,i); sort2(k,o); sort2(l,m); \
sort2(a,d); sort2(b,e); sort2(f,h); \
sort2(g,n); sort2(i,j); sort2(k,l); sort2(m,o); \
sort2(a,c); sort2(b,f); sort2(d,i); \
sort2(e,g); sort2(h,k); sort2(j,l); sort2(m,n); \
sort2(a,b); sort2(c,f); sort2(d,k); \
sort2(e,i); sort2(g,h); sort2(j,m); sort2(l,n); \
sort2(b,c); sort2(d,e); sort2(f,g); \
sort2(h,j); sort2(i,k); sort2(l,m); \
sort2(d,f); sort2(e,g); sort2(h,i); sort2(j,k); \
sort2(c,d); sort2(e,f); sort2(g,h); \
sort2(i,j); sort2(k,l); \
} while (0)
#define sort16(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) do { \
sort2(a,f); sort2(b,e); sort2(c,m); sort2(d,n); \
sort2(g,h); sort2(i,j); sort2(k,p); sort2(l,o); \
sort2(a,c); sort2(b,k); sort2(d,g); sort2(e,h); \
sort2(f,o); sort2(i,l); sort2(j,m); sort2(n,p); \
sort2(a,i); sort2(b,d); sort2(c,l); sort2(e,n); \
sort2(f,j); sort2(g,k); sort2(h,p); sort2(m,o); \
sort2(a,b); sort2(c,e); sort2(d,i); sort2(f,g); \
sort2(h,m); sort2(j,k); sort2(l,n); sort2(o,p); \
sort2(b,d); sort2(c,f); sort2(e,i); sort2(g,j); \
sort2(h,l); sort2(k,n); sort2(m,o); \
sort2(b,c); sort2(d,f); sort2(e,l); sort2(g,i); \
sort2(h,j); sort2(k,m); sort2(n,o); \
sort2(c,d); sort2(e,f); sort2(g,h); sort2(i,j); \
sort2(k,l); sort2(m,n); \
sort2(e,g); sort2(f,h); sort2(i,k); sort2(j,l); \
sort2(d,e); sort2(f,g); sort2(h,i); \
sort2(j,k); sort2(l,m); \
} while (0)
#define sort17(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q) do { \
sort2(b,c); sort2(d,e); sort2(f,g); sort2(h,i); \
sort2(j,k); sort2(l,m); sort2(n,o); sort2(p,q); \
sort2(b,d); sort2(c,e); sort2(f,h); sort2(g,i); \
sort2(j,l); sort2(k,m); sort2(n,p); sort2(o,q); \
sort2(b,f); sort2(c,g); sort2(d,h); sort2(e,i); \
sort2(j,n); sort2(k,o); sort2(l,p); sort2(m,q); \
sort2(a,d); sort2(b,n); sort2(c,k); sort2(e,h); \
sort2(f,l); sort2(g,m); sort2(i,j); sort2(o,p); \
sort2(a,n); sort2(b,i); sort2(c,f); sort2(d,g); \
sort2(e,o); sort2(h,p); sort2(j,q); sort2(k,l); \
sort2(a,b); sort2(c,i); sort2(d,e); sort2(f,k); \
sort2(g,n); sort2(h,l); sort2(m,o); \
sort2(b,f); sort2(d,i); sort2(e,k); sort2(g,h); \
sort2(j,m); sort2(l,n); \
sort2(b,c); sort2(e,g); sort2(f,i); sort2(h,k); \
sort2(j,l); sort2(m,o); sort2(n,p); \
sort2(c,d); sort2(e,f); sort2(g,i); sort2(h,j); \
sort2(k,l); sort2(m,n); sort2(o,p); \
sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
sort2(l,m); sort2(n,o); sort2(p,q); \
} while (0)
#define sort18(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r) do { \
sort2(a,g); sort2(b,k); sort2(c,p); sort2(d,f); sort2(e,j); \
sort2(h,q); sort2(i,n); sort2(l,r); sort2(m,o); \
sort2(a,m); sort2(b,e); sort2(d,l); sort2(f,r); sort2(g,o); \
sort2(h,i); sort2(j,k); sort2(n,q); \
sort2(b,n); sort2(c,h); sort2(e,q); sort2(g,j); \
sort2(i,l); sort2(k,p); \
sort2(a,b); sort2(c,d); sort2(e,m); sort2(f,n); \
sort2(h,j); sort2(i,k); sort2(o,p); sort2(q,r); \
sort2(a,c); sort2(b,l); sort2(d,e); sort2(f,h); \
sort2(g,q); sort2(k,m); sort2(n,o); sort2(p,r); \
sort2(b,i); sort2(e,k); sort2(f,g); sort2(h,n); \
sort2(j,q); sort2(l,m); \
sort2(b,d); sort2(c,f); sort2(e,h); sort2(g,i); \
sort2(j,l); sort2(k,n); sort2(m,p); sort2(o,q); \
sort2(b,c); sort2(d,f); sort2(e,g); sort2(h,j); \
sort2(i,k); sort2(l,n); sort2(m,o); sort2(p,q); \
sort2(c,d); sort2(f,i); sort2(g,h); sort2(j,m); \
sort2(k,l); sort2(o,p); \
sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
sort2(l,m); sort2(n,o); \
sort2(e,f); sort2(g,h); sort2(i,j); \
sort2(k,l); sort2(m,n); \
} while (0)
#define sort19(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s) do { \
sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); \
sort2(i,k); sort2(l,m); sort2(n,o); sort2(p,q); sort2(r,s); \
sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); \
sort2(i,j); sort2(l,n); sort2(m,o); sort2(p,r); sort2(q,s); \
sort2(a,e); sort2(b,f); sort2(c,g); sort2(d,h); \
sort2(j,k); sort2(l,p); sort2(m,q); sort2(n,r); sort2(o,s); \
sort2(a,l); sort2(b,i); sort2(c,n); sort2(d,r); \
sort2(e,k); sort2(f,g); sort2(j,q); sort2(m,p); \
sort2(b,c); sort2(d,n); sort2(e,m); sort2(f,o); \
sort2(g,q); sort2(h,k); sort2(i,p); \
sort2(a,b); sort2(c,l); sort2(d,j); sort2(f,m); \
sort2(g,p); sort2(h,n); sort2(k,s); sort2(o,r); \
sort2(b,e); sort2(d,i); sort2(f,l); sort2(g,j); \
sort2(h,m); sort2(k,n); sort2(o,p); sort2(q,r); \
sort2(c,e); sort2(d,f); sort2(g,h); \
sort2(i,l); sort2(j,m); sort2(k,o); sort2(n,p); \
sort2(c,d); sort2(e,f); sort2(g,i); sort2(h,j); \
sort2(k,l); sort2(m,o); sort2(n,q); sort2(p,r); \
sort2(b,c); sort2(e,g); sort2(f,i); sort2(h,k); \
sort2(j,l); sort2(m,n); sort2(o,q); \
sort2(d,e); sort2(f,g); sort2(h,i); \
sort2(j,k); sort2(l,m); sort2(n,o); sort2(p,q); \
} while (0)
#define sort20(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t) do { \
sort2(a,m); sort2(b,n); sort2(c,o); sort2(d,p); sort2(e,q); \
sort2(f,r); sort2(g,s); sort2(h,t); sort2(i,k); sort2(j,l); \
sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); \
sort2(i,j); sort2(k,l); sort2(m,o); sort2(n,p); sort2(q,s); sort2(r,t); \
sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); \
sort2(m,n); sort2(o,p); sort2(q,r); sort2(s,t); \
sort2(a,e); sort2(b,m); sort2(c,q); sort2(d,r); \
sort2(f,i); sort2(g,j); sort2(h,s); \
sort2(k,n); sort2(l,o); sort2(p,t); \
sort2(b,g); sort2(d,k); sort2(e,f); sort2(h,l); \
sort2(i,m); sort2(j,q); sort2(n,s); sort2(o,p); \
sort2(a,e); sort2(c,i); sort2(d,j); sort2(g,h); \
sort2(k,q); sort2(l,r); sort2(m,n); sort2(p,t); \
sort2(b,e); sort2(d,g); sort2(f,i); sort2(h,k); \
sort2(j,m); sort2(l,o); sort2(n,q); sort2(p,s); \
sort2(c,d); sort2(e,f); sort2(g,i); sort2(h,j); \
sort2(k,m); sort2(l,n); sort2(o,p); sort2(q,r); \
sort2(c,e); sort2(d,g); sort2(f,h); sort2(i,k); \
sort2(j,l); sort2(m,o); sort2(n,q); sort2(p,r); \
sort2(b,c); sort2(d,f); sort2(g,h); sort2(i,j); \
sort2(k,l); sort2(m,n); sort2(o,q); sort2(r,s); \
sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
sort2(l,m); sort2(n,o); sort2(p,q); \
} while (0)
#define sort21(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u) do { \
sort2(a,h); sort2(b,k); sort2(d,f); sort2(e,i); sort2(g,n); \
sort2(j,t); sort2(l,o); sort2(m,r); sort2(p,q); sort2(s,u); \
sort2(a,l); sort2(b,p); sort2(c,m); sort2(d,e); sort2(f,i); \
sort2(g,j); sort2(h,o); sort2(k,q); sort2(n,t); sort2(r,u); \
sort2(a,g); sort2(b,d); sort2(c,s); sort2(e,p); sort2(f,k); \
sort2(i,q); sort2(l,r); sort2(m,n); sort2(o,u); \
sort2(c,g); sort2(f,m); sort2(h,s); sort2(i,o); \
sort2(j,l); sort2(k,r); sort2(n,t); sort2(q,u); \
sort2(b,c); sort2(e,h); sort2(f,j); sort2(g,r); \
sort2(k,n); sort2(l,m); sort2(o,t); sort2(p,s); \
sort2(a,c); sort2(d,g); sort2(e,f); sort2(h,k); \
sort2(i,l); sort2(j,p); sort2(m,q); sort2(n,s); \
sort2(o,r); sort2(t,u); \
sort2(a,b); sort2(c,d); sort2(f,j); sort2(g,m); \
sort2(h,i); sort2(l,o); sort2(n,p); sort2(q,t); sort2(r,s); \
sort2(b,c); sort2(d,j); sort2(g,n); sort2(k,l); \
sort2(m,p); sort2(q,r); sort2(s,t); \
sort2(b,e); sort2(c,f); sort2(d,h); sort2(g,k); \
sort2(i,j); sort2(l,m); sort2(n,o); sort2(r,s); \
sort2(c,e); sort2(f,g); sort2(h,i); sort2(j,l); \
sort2(k,n); sort2(m,p); sort2(o,q); \
sort2(d,e); sort2(f,h); sort2(g,i); sort2(j,k); \
sort2(l,n); sort2(m,o); sort2(p,q); \
sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
sort2(m,n); sort2(o,p); sort2(q,r); \
} while (0)
#define sort22(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v) do { \
sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
sort2(m,n); sort2(o,p); sort2(q,r); sort2(s,t); sort2(u,v); \
sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); sort2(i,k); sort2(l,n); \
sort2(o,q); sort2(p,r); sort2(s,u); sort2(t,v); \
sort2(a,e); sort2(b,f); sort2(c,g); sort2(d,h); sort2(i,m); sort2(j,n); \
sort2(o,s); sort2(p,t); sort2(q,u); sort2(r,v); \
sort2(a,o); sort2(b,p); sort2(c,s); sort2(d,t); sort2(e,q); sort2(f,r); \
sort2(g,u); sort2(h,v); sort2(j,l); sort2(k,m); \
sort2(a,i); sort2(c,k); sort2(e,o); sort2(f,m); sort2(g,p); sort2(h,r); \
sort2(j,q); sort2(l,t); sort2(n,v); \
sort2(b,j); sort2(c,e); sort2(d,q); sort2(f,s); sort2(g,k); sort2(h,n); \
sort2(i,o); sort2(l,p); sort2(m,u); sort2(r,t); \
sort2(b,i); sort2(d,l); sort2(e,f); sort2(h,m); sort2(j,o); sort2(k,s); \
sort2(n,u); sort2(q,r); \
sort2(b,c); sort2(d,f); sort2(e,i); sort2(g,j); sort2(h,l); sort2(k,o); \
sort2(m,p); sort2(n,r); sort2(q,s); sort2(t,u); \
sort2(c,e); sort2(d,g); sort2(f,j); sort2(h,k); sort2(l,o); sort2(m,q); \
sort2(p,s); sort2(r,t); \
sort2(d,e); sort2(f,h); sort2(g,i); sort2(j,l); sort2(k,m); sort2(n,p); \
sort2(o,q); sort2(r,s); \
sort2(f,g); sort2(h,i); sort2(j,k); sort2(l,m); sort2(n,o); sort2(p,q); \
sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); sort2(m,n); sort2(o,p); \
sort2(q,r); \
} while (0)
#define sort23(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w) do { \
sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
sort2(m,n); sort2(o,p); sort2(q,r); sort2(s,t); sort2(u,v); \
sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); sort2(i,k); sort2(j,l); \
sort2(m,o); sort2(n,p); sort2(r,t); sort2(s,u); sort2(v,w); \
sort2(a,e); sort2(b,f); sort2(c,g); sort2(d,h); \
sort2(i,m); sort2(j,n); sort2(k,o); sort2(l,p); \
sort2(q,v); sort2(r,w); \
sort2(b,k); sort2(c,j); sort2(d,l); sort2(g,t); \
sort2(m,r); sort2(o,w); sort2(q,s); sort2(u,v); \
sort2(a,q); sort2(b,c); sort2(d,v); sort2(e,r); \
sort2(f,o); sort2(g,n); sort2(h,w); sort2(j,s); \
sort2(k,u); sort2(p,t); \
sort2(b,k); sort2(c,j); sort2(d,r); sort2(e,m); \
sort2(f,s); sort2(g,u); sort2(h,p); sort2(i,q); \
sort2(l,o); sort2(n,v); sort2(t,w); \
sort2(a,i); sort2(b,e); sort2(c,k); sort2(d,j); \
sort2(f,g); sort2(l,v); sort2(m,q); sort2(n,u); \
sort2(o,p); sort2(r,s); \
sort2(c,i); sort2(d,f); sort2(e,m); sort2(g,j); \
sort2(h,l); sort2(k,q); sort2(n,r); sort2(p,v); \
sort2(s,u); \
sort2(b,c); sort2(e,i); sort2(f,k); sort2(g,m); \
sort2(h,n); sort2(j,q); sort2(l,s); sort2(o,r); \
sort2(p,t); \
sort2(c,e); sort2(d,f); sort2(g,i); sort2(h,j); \
sort2(k,m); sort2(l,n); sort2(o,q); sort2(p,u); \
sort2(r,s); sort2(t,v); \
sort2(d,g); sort2(f,i); sort2(h,k); sort2(j,m); \
sort2(l,o); sort2(n,q); sort2(p,r); sort2(s,u); \
sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
sort2(l,m); sort2(n,o); sort2(p,q); sort2(r,s); \
sort2(t,u); \
} while (0)
#define sort24(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x) do { \
sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
sort2(m,n); sort2(o,p); sort2(q,r); sort2(s,t); sort2(u,v); sort2(w,x); \
sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); sort2(i,k); sort2(j,l); \
sort2(m,o); sort2(n,p); sort2(q,s); sort2(r,t); sort2(u,w); sort2(v,x); \
sort2(a,e); sort2(b,f); sort2(c,g); sort2(d,h); \
sort2(i,m); sort2(j,n); sort2(k,o); sort2(l,p); \
sort2(q,u); sort2(r,v); sort2(s,w); sort2(t,x); \
sort2(a,q); sort2(b,s); sort2(c,r); sort2(d,t); \
sort2(e,u); sort2(f,w); sort2(g,v); sort2(h,x); \
sort2(j,k); sort2(n,o); \
sort2(c,k); sort2(d,l); sort2(f,s); sort2(g,o); \
sort2(h,p); sort2(i,q); sort2(j,r); sort2(m,u); \
sort2(n,v); \
sort2(a,i); sort2(b,j); sort2(c,m); sort2(d,u); \
sort2(e,q); sort2(f,n); sort2(g,r); sort2(h,t); \
sort2(k,s); sort2(l,v); sort2(o,w); sort2(p,x); \
sort2(b,i); sort2(d,q); sort2(e,m); sort2(f,k); \
sort2(g,j); sort2(h,u); sort2(l,t); sort2(n,s); \
sort2(o,r); sort2(p,w); \
sort2(c,e); sort2(d,f); sort2(h,n); sort2(j,m); \
sort2(k,q); sort2(l,o); sort2(s,u); sort2(t,v); \
sort2(b,c); sort2(e,i); sort2(f,j); sort2(g,k); \
sort2(h,l); sort2(m,q); sort2(n,r); sort2(o,s); \
sort2(p,t); sort2(v,w); \
sort2(c,e); sort2(d,i); sort2(f,g); sort2(h,j); \
sort2(k,m); sort2(l,n); sort2(o,q); sort2(p,u); \
sort2(r,s); sort2(t,v); \
sort2(d,f); sort2(g,i); sort2(h,k); sort2(j,m); \
sort2(l,o); sort2(n,q); sort2(p,r); sort2(s,u); \
sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
sort2(l,m); sort2(n,o); sort2(p,q); sort2(r,s); sort2(t,u); \
} while (0)
void sorting_network(TYPE* l, int partsz_min_1) {
// partsz_min_1 == right - left
switch (partsz_min_1) {
case 23:
sort24(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
l[16], l[17], l[18], l[19], l[20], l[21], l[22], l[23]);
break;
case 22:
sort23(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
l[16], l[17], l[18], l[19], l[20], l[21], l[22]);
break;
case 21:
sort22(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
l[16], l[17], l[18], l[19], l[20], l[21]);
break;
case 20:
sort21(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
l[16], l[17], l[18], l[19], l[20]);
break;
case 19:
sort20(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
l[16], l[17], l[18], l[19]);
break;
case 18:
sort19(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
l[16], l[17], l[18]);
break;
case 17:
sort18(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15], l[16], l[17]);
break;
case 16:
sort17(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15], l[16]);
break;
case 15:
sort16(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15]);
break;
case 14:
sort15(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11], l[12], l[13], l[14]);
break;
case 13:
sort14(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11], l[12], l[13]);
break;
case 12:
sort13(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11], l[12]);
break;
case 11:
sort12(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11]);
break;
case 10:
sort11(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8], l[9], l[10]);
break;
case 9:
sort10(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8], l[9]);
break;
case 8:
sort9(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8]);
break;
case 7:
sort8(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7]);
break;
case 6:
sort7(l[0], l[1], l[2], l[3], l[4], l[5], l[6]);
break;
case 5:
sort6(l[0], l[1], l[2], l[3], l[4], l[5]);
break;
case 4:
sort5(l[0], l[1], l[2], l[3], l[4]);
break;
case 3:
sort4(l[0], l[1], l[2], l[3]);
break;
case 2:
sort3(l[0], l[1], l[2]);
break;
case 1:
sort2(l[0], l[1]);
break;
case 0:
break;
}
}
// ---------------------- partition ----------------------------
#define med5(a, b, c, d, e) do { \
sort2(a, b); sort2(c, d); sort2(a, c); \
sort2(b, d); sort2(b, c); sort2(c, e); \
sort2(b, c); \
} while(0)
TYPE* partition_small(TYPE* restrict left, TYPE* restrict right) {
TYPE* outerleft = left;
TYPE* pivp = left + (right - left) / 2;
med5(left[1], left[2], *pivp, right[-1], *right);
left += 3;
right -= 2;
TYPE piv = *pivp;
*pivp = *outerleft;
TYPE _Alignas(64) swbuf[SMALLPART];
TYPE* sw = swbuf;
TYPE* left0 = left;
while (left <= right) {
unsigned h = IS_LOWER(*left, piv);
*left0 = *sw = *left++;
left0 += h;
sw += !h;
}
memcpy(left0, swbuf, (sw - swbuf) * sizeof(TYPE));
left0 -= 1;
*outerleft = *left0;
*left0 = piv;
return left0;
}
TYPE* partition(TYPE* restrict left, TYPE* restrict right) {
TYPE* outerleft = left;
int n = right - left;
TYPE* pivp = left + n / 2;
TYPE piv;
TYPE* q1 = left + n / 4;
TYPE* q3 = pivp + n / 4;
left += 1;
med5(left[0], left[1], left[2], left[3], left[4]);
med5(q1[-2], q1[-1], q1[0], q1[1], q1[2]);
med5(pivp[-2], pivp[-1], pivp[0], pivp[1], pivp[2]);
med5(q3[-2], q3[-1], q3[0], q3[1], q3[2]);
med5(right[-4], right[-3], right[-2], right[-1], right[0]);
med5(left[2], q1[0], pivp[0], q3[0], right[-2]);
piv = *pivp;
*pivp = *outerleft;
#define SWSZ 1024
TYPE _Alignas(64) swbuf[SWSZ];
TYPE* left0 = left;
TYPE* right0 = right;
TYPE* sw = swbuf;
#define UNROLL 16
while (sw < swbuf + SWSZ - UNROLL && left <= right - UNROLL) {
for (int i = UNROLL; i--;) {
unsigned h = IS_LOWER(*right, piv); *right0 = *sw = *right--; right0 -= !h; sw += h;
}
}
while (sw < swbuf + SWSZ - UNROLL && left <= right) {
unsigned h = IS_LOWER(*right, piv); *right0 = *sw = *right--; right0 -= !h; sw += h;
}
while (1) {
while (right0 > right + UNROLL && left <= right - UNROLL) {
for (int i = UNROLL; i--;) {
unsigned h = IS_LOWER(*left, piv); *left0 = *right0 = *left++; left0 += h; right0 -= !h;
}
}
while (left0 < left - UNROLL && left <= right - UNROLL) {
for (int i = UNROLL; i--;) {
unsigned h = IS_LOWER(*right, piv); *right0 = *left0 = *right--; right0 -= !h; left0 += h;
}
}
if (left > right - UNROLL) break;
}
while (right0 > right && left <= right) {
unsigned h = IS_LOWER(*left, piv); *left0 = *right0 = *left++; left0 += h; right0 -= !h;
}
while (left <= right) {
unsigned h = IS_LOWER(*right, piv); *right0 = *left0 = *right--; right0 -= !h; left0 += h;
}
memcpy(left0, swbuf, (sw - swbuf) * sizeof(TYPE));
*outerleft = *right0;
*right0 = piv;
return right0;
}
// ---------------------- heapsort, testing and main -----------------------------------
.
.
Sorting 50 million numbers Optimized Branch-Avoidant Quicksort ...
0.870s
christof.kaser@gmail.com