When ‘if’ slows you down, avoid it
Fast Branch-Avoidant Quicksort with Sorting-Networks
New threads are created only for large partitions and are limited so that we don’t start too many threads. Smaller partitions are processed in the current thread.
// SPDX-License-Identifier: MIT
// (c) 2026 christof.kaser@gmail.com
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>
#include <pthread.h>
#include <unistd.h>
#include <stdatomic.h>
#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int
#define NAME "Threaded Branchless Quicksort"
void heap_sort(TYPE* left, TYPE* right);
void sorting_network(TYPE* left, int size);
TYPE* partition(TYPE* left, TYPE* right);
TYPE* partition_small(TYPE* left, TYPE* right);
#define SMALLPART 256
int max_threads;
atomic_int n_threads;
pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
void* sort_thr(void *arg) {
TYPE** stack = (TYPE**)arg;
int sp = 0;
TYPE* left = stack[sp];
TYPE* right = stack[sp + 1];
while (1) {
int partsz = right - left;
if (partsz <= 11) {
sorting_network(left, partsz);
goto pop_stack;
}
TYPE* l;
TYPE* r;
TYPE* mid;
if (partsz <= SMALLPART) mid = partition_small(left, right);
else mid = partition(left, right);
if (partsz > SMALLPART && mid - left < 100) {
heap_sort(left, right);
goto pop_stack;
}
if (mid < left + partsz / 2) {
l = mid + 1;
r = right;
right = mid - 1;
}
else {
l = left;
r = mid - 1;
left = mid + 1;
}
// soft limit: may exceed max_threads
TYPE** stack_new = NULL;
if (r - l > 1000000 && n_threads < max_threads) {
// start a new thread - max_threads is a soft limit
stack_new = malloc(64 * sizeof(TYPE*));
// stack size 64 is sufficient for n <= 2^32
if (stack_new) {
stack_new[0] = l;
stack_new[1] = r;
pthread_mutex_lock(&mtx);
pthread_t thread;
if (pthread_create(&thread, NULL, sort_thr, stack_new) == 0) {
n_threads += 1;
pthread_detach(thread);
}
else {
free(stack_new);
stack_new = NULL;
}
pthread_mutex_unlock(&mtx);
}
}
if (stack_new == NULL) {
stack[sp] = l;
stack[sp + 1] = r;
sp += 2;
}
continue;
pop_stack:
if (sp == 0) break;
sp -= 2;
left = stack[sp];
right = stack[sp + 1];
}
free(stack);
pthread_mutex_lock(&mtx);
n_threads -= 1;
if (n_threads == 0) pthread_cond_signal(&cond);
pthread_mutex_unlock(&mtx);
return NULL;
}
void sort(TYPE* data, int len) {
int n_cpus = sysconf(_SC_NPROCESSORS_ONLN);
// printf("CPUs: %d\n", n_cpus);
if (n_cpus > 0) max_threads = n_cpus * 2;
else max_threads = 8;
pthread_t thread;
TYPE** stack = malloc(64 * sizeof(TYPE*));
stack[0] = data;
stack[1] = data + len - 1;
n_threads = 1;
pthread_create(&thread, NULL, sort_thr, stack);
pthread_mutex_lock(&mtx);
while (n_threads != 0) pthread_cond_wait(&cond, &mtx);
pthread_mutex_unlock(&mtx);
}
// ---------------------- sorting network ----------------------------
#define sort2(a, b) do { \
TYPE x = a; \
TYPE y = b; \
unsigned m = IS_LOWER(x, y); \
a = m ? x : y; \
b = m ? y : x; \
} while(0)
// https://bertdobbelaere.github.io/sorting_networks.html
#define sort3(a,b,c) do { \
sort2(a,c);\
sort2(a,b);\
sort2(b,c);\
} while(0)
#define sort4(a,b,c,d) do { \
sort2(a,c);sort2(b,d);\
sort2(a,b);sort2(c,d);\
sort2(b,c);\
} while(0)
#define sort5(a,b,c,d,e) do { \
sort2(a,d);sort2(b,e);\
sort2(a,c);sort2(b,d);\
sort2(a,b);sort2(c,e);\
sort2(b,c);sort2(d,e);\
sort2(c,d);\
} while(0)
#define sort6(a,b,c,d,e,f) do { \
sort2(a,f);sort2(b,d);sort2(c,e);\
sort2(b,c);sort2(d,e);\
sort2(a,d);sort2(c,f);\
sort2(a,b);sort2(c,d);sort2(e,f);\
sort2(b,c);sort2(d,e);\
} while(0)
#define sort7(a,b,c,d,e,f,g) do { \
sort2(a,g);sort2(c,d);sort2(e,f);\
sort2(a,c);sort2(b,e);sort2(d,g);\
sort2(a,b);sort2(c,f);sort2(d,e);\
sort2(b,c);sort2(e,g);\
sort2(c,d);sort2(e,f);\
sort2(b,c);sort2(d,e);sort2(f,g);\
} while(0)
#define sort8(a,b,c,d,e,f,g,h) do { \
sort2(a,c);sort2(b,d);sort2(e,g);sort2(f,h);\
sort2(a,e);sort2(b,f);sort2(c,g);sort2(d,h);\
sort2(a,b);sort2(c,d);sort2(e,f);sort2(g,h);\
sort2(c,e);sort2(d,f);\
sort2(b,e);sort2(d,g);\
sort2(b,c);sort2(d,e);sort2(f,g);\
} while (0)
#define sort9(a,b,c,d,e,f,g,h,i) do { \
sort2(a,d);sort2(b,h);sort2(c,f);sort2(e,i);\
sort2(a,h);sort2(c,e);sort2(d,i);sort2(f,g);\
sort2(a,c);sort2(b,d);sort2(e,f);sort2(h,i);\
sort2(b,e);sort2(d,g);sort2(f,h);\
sort2(a,b);sort2(c,e);sort2(d,f);sort2(g,i);\
sort2(c,d);sort2(e,f);sort2(g,h);\
sort2(b,c);sort2(d,e);sort2(f,g);\
} while (0)
#define sort10(a,b,c,d,e,f,g,h,i,j) do { \
sort2(a,i);sort2(b,j);sort2(c,h);sort2(d,f);sort2(e,g);\
sort2(a,c);sort2(b,e);sort2(f,i);sort2(h,j);\
sort2(a,d);sort2(c,e);sort2(f,h);sort2(g,j);\
sort2(a,b);sort2(d,g);sort2(i,j);\
sort2(b,f);sort2(c,d);sort2(e,i);sort2(g,h);\
sort2(b,c);sort2(d,f);sort2(e,g);sort2(h,i);\
sort2(c,d);sort2(e,f);sort2(g,h);\
sort2(d,e);sort2(f,g);\
} while(0)
#define sort11(a,b,c,d,e,f,g,h,i,j,k) do { \
sort2(a,j);sort2(b,g);sort2(c,e);sort2(d,h);sort2(f,i);\
sort2(a,b);sort2(d,f);sort2(e,k);sort2(g,j);sort2(h,i);\
sort2(b,d);sort2(c,f);sort2(e,h);sort2(i,k);\
sort2(a,e);sort2(b,c);sort2(d,h);sort2(f,j);sort2(g,i);\
sort2(a,b);sort2(c,g);sort2(e,f);sort2(h,i);sort2(j,k);\
sort2(c,e);sort2(d,g);sort2(f,h);sort2(i,j);\
sort2(b,c);sort2(d,e);sort2(f,g);sort2(h,i);\
sort2(c,d);sort2(e,f);sort2(g,h);\
} while (0)
#define sort12(a,b,c,d,e,f,g,h,i,j,k,l) do { \
sort2(a,i);sort2(b,h);sort2(c,g);sort2(d,l);sort2(e,k);sort2(f,j);\
sort2(a,b);sort2(c,f);sort2(d,e);sort2(g,j);sort2(h,i);sort2(k,l);\
sort2(a,c);sort2(b,g);sort2(f,k);sort2(j,l);\
sort2(a,d);sort2(b,c);sort2(e,g);sort2(f,h);sort2(i,l);sort2(j,k);\
sort2(b,e);sort2(d,f);sort2(g,i);sort2(h,k);\
sort2(b,d);sort2(c,f);sort2(g,j);sort2(i,k);\
sort2(c,d);sort2(e,f);sort2(g,h);sort2(i,j);\
sort2(e,g);sort2(f,h);\
sort2(d,e);sort2(f,g);sort2(h,i);\
} while (0)
void sorting_network(TYPE* l, int partsz_min_1) {
// partsz_min_1 == right - left
switch (partsz_min_1) {
case 11:
sort12(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11]);
break;
case 10:
sort11(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8], l[9], l[10]);
break;
case 9:
sort10(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8], l[9]);
break;
case 8:
sort9(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8]);
break;
case 7:
sort8(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7]);
break;
case 6:
sort7(l[0], l[1], l[2], l[3], l[4], l[5], l[6]);
break;
case 5:
sort6(l[0], l[1], l[2], l[3], l[4], l[5]);
break;
case 4:
sort5(l[0], l[1], l[2], l[3], l[4]);
break;
case 3:
sort4(l[0], l[1], l[2], l[3]);
break;
case 2:
sort3(l[0], l[1], l[2]);
break;
case 1:
sort2(l[0], l[1]);
break;
case 0:
break;
}
}
// ---------------------- partition ----------------------------
#define med5(a, b, c, d, e) do { \
sort2(a, b); sort2(c, d); sort2(a, c); \
sort2(b, d); sort2(b, c); sort2(c, e); \
sort2(b, c); \
} while(0)
TYPE* partition_small(TYPE* restrict left, TYPE* restrict right) {
TYPE* outerleft = left;
TYPE* pivp = left + (right - left) / 2;
med5(left[1], left[2], *pivp, right[-1], *right);
left += 3;
right -= 2;
TYPE piv = *pivp;
*pivp = *outerleft;
TYPE swbuf[SMALLPART];
TYPE* sw = swbuf;
TYPE* lwr = left;
while (left <= right) {
unsigned h = IS_LOWER(*left, piv); *lwr = *sw = *left++; lwr += h; sw += !h;
}
memcpy(lwr, swbuf, (sw - swbuf) * sizeof(TYPE));
lwr -= 1;
*outerleft = *lwr;
*lwr = piv;
return lwr;
}
#define SWSZ 2048
#define UNROLL 16
TYPE* partition(TYPE* restrict left, TYPE* restrict right) {
TYPE* outerleft = left;
TYPE* pivp = left + (right - left) / 2;
med5(left[3], left[4], left[1], left[5], left[6]);
med5(left[11], left[12], left[2], left[13], left[14]);
med5(pivp[-20], pivp[-10], pivp[0], pivp[10], pivp[20]);
med5(right[-6], right[-7], right[-1], right[-8], right[-9]);
med5(right[-15], right[-14], right[0], right[-13], right[-12]);
med5(left[1], left[2], pivp[0], right[-1], right[0]);
left += 3;
right -= 2;
TYPE piv = *pivp;
*pivp = *outerleft;
TYPE swbuf[SWSZ];
TYPE* lwr = left;
TYPE* rwr = right;
TYPE* sw = swbuf;
while (sw < swbuf + SWSZ - UNROLL && left <= right - UNROLL) {
for (int i = UNROLL; i--;) {
unsigned h = IS_LOWER(*right, piv); *rwr = *sw = *right--; rwr -= !h; sw += h;
}
}
while (sw < swbuf + SWSZ - UNROLL && left <= right) {
unsigned h = IS_LOWER(*right, piv); *rwr = *sw = *right--; rwr -= !h; sw += h;
}
while (left <= right - UNROLL) {
while (rwr > right + UNROLL && left <= right - UNROLL) {
for (int i = UNROLL; i--;) {
unsigned h = IS_LOWER(*left, piv); *lwr = *rwr = *left++; lwr += h; rwr -= !h;
}
}
while (lwr < left - UNROLL && left <= right - UNROLL) {
for (int i = UNROLL; i--;) {
unsigned h = IS_LOWER(*right, piv); *rwr = *lwr = *right--; rwr -= !h; lwr += h;
}
}
}
while (rwr > right && left <= right) {
unsigned h = IS_LOWER(*left, piv); *lwr = *rwr = *left++; lwr += h; rwr -= !h;
}
while (left <= right) {
unsigned h = IS_LOWER(*right, piv); *rwr = *lwr = *right--; rwr -= !h; lwr += h;
}
memcpy(lwr, swbuf, (sw - swbuf) * sizeof(TYPE));
*outerleft = *rwr;
*rwr = piv;
return rwr;
}
// ---------------------- heap_sort, testing and main -----------------------------------
void heap_sort(TYPE* left, TYPE* right) {
long n = right - left + 1;
if (n < 2) return;
for (long i = n / 2, j; ; ) {
TYPE k;
if (i > 0) {
k = left[--i];
}
else {
n -= 1;
if (n == 0) return;
k = left[n];
left[n] = left[0];
}
j = i;
while (j * 2 + 1 < n) {
long child = j * 2 + 1;
if (child + 1 < n && IS_LOWER(left[child], left[child + 1])) child++;
if (!IS_LOWER(k, left[child])) break;
left[j] = left[child];
j = child;
}
left[j] = k;
}
}
unsigned chksum;
unsigned hash_el(void *p, int sz) {
unsigned char* c = (unsigned char*)p;
unsigned h = 0;
for (int i = 0; i < sz; i++) h = (h * 131) + c[i];
return h;
}
void init(TYPE* data, int len) {
chksum = 0;
for (int i = 0; i < len; i++) {
data[i] = rand();
chksum += hash_el(&data[i], sizeof(TYPE));
}
}
void test(TYPE* data, int len) {
unsigned chks = hash_el(&data[0], sizeof(TYPE));;
for (int i = 1; i < len; i++) {
if (IS_LOWER(data[i], data[i - 1])) {
printf("ERROR ORDER\n");
return;
}
chks += hash_el(&data[i], sizeof(TYPE));
}
if (chks != chksum) printf("ERROR CHKS\n");
}
double t(void) {
static double twr;
struct timeval tv;
gettimeofday(&tv, NULL);
double h = twr;
twr = tv.tv_sec + tv.tv_usec / 1000000.0;
return twr - h;
}
#define SIZE (50 * 1000000)
TYPE data[SIZE];
int main(void) {
srand(time(NULL));
init(data, SIZE);
printf(NAME"\n");
printf("Sorting %d million integers ...\n", SIZE / 1000000);
t();
sort(data, SIZE);
printf("%.2fs\n", t());
test(data, SIZE);
return 0;
}
The measurements were taken on an Apple M1 system with the -O3 compiler option.
Sorting 50 million numbers with Threaded Branch-Avoidant Quicksort ...
0.23s
Multithreading and the avoidance of branch mispredictions can significantly improve the application performance of programs.
christof.kaser@gmail.com