Multithreaded Branch‑Avoidant Quicksort

When ‘if’ slows you down, avoid it

Fast Branch-Avoidant Quicksort with Sorting-Networks

Branch-Avoidant Multi-Threaded Quicksort

New threads are created only for large partitions and are limited so that we don’t start too many threads. Smaller partitions are processed in the current thread.

// SPDX-License-Identifier: MIT
// (c) 2026 christof.kaser@gmail.com
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>
#include <pthread.h>
#include <unistd.h>
#include <stdatomic.h>

#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int

#define NAME "Threaded Branchless Quicksort"

void heap_sort(TYPE* left, TYPE* right);
void sorting_network(TYPE* left, int size);
TYPE* partition(TYPE* left, TYPE* right);
TYPE* partition_small(TYPE* left, TYPE* right);

#define SMALLPART 256

int max_threads;
atomic_int n_threads;

pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

void* sort_thr(void *arg) {

    TYPE** stack = (TYPE**)arg;
    int sp = 0;
    TYPE* left = stack[sp];
    TYPE* right = stack[sp + 1];

    while (1) {
        int partsz = right - left;
        if (partsz <= 11) {
            sorting_network(left, partsz);
            goto pop_stack;
        }
        TYPE* l;
        TYPE* r;

        TYPE* mid;
        if (partsz <= SMALLPART) mid = partition_small(left, right);
        else mid = partition(left, right);

        if (partsz > SMALLPART && mid - left < 100) {
            heap_sort(left, right);
            goto pop_stack;
        }

        if (mid < left + partsz / 2) {
            l = mid + 1;
            r = right;
            right = mid - 1;
        }
        else {
            l = left;
            r = mid - 1;
            left = mid + 1;
        }
        // soft limit: may exceed max_threads
        TYPE** stack_new = NULL;
        if (r - l > 1000000 && n_threads < max_threads) {
            // start a new thread - max_threads is a soft limit
            stack_new = malloc(64 * sizeof(TYPE*));
            // stack size 64 is sufficient for n <= 2^32
            if (stack_new) {
                stack_new[0] = l;
                stack_new[1] = r;

                pthread_mutex_lock(&mtx);
                pthread_t thread;
                if (pthread_create(&thread, NULL, sort_thr, stack_new) == 0) {
                    n_threads += 1;
                    pthread_detach(thread);
                }
                else {
                    free(stack_new);
                    stack_new = NULL;
                }
                pthread_mutex_unlock(&mtx);
            }
        }
        if (stack_new == NULL) {
            stack[sp] = l;
            stack[sp + 1] = r;
            sp += 2;
        }
        continue;

    pop_stack:
        if (sp == 0) break;
        sp -= 2;
        left = stack[sp];
        right = stack[sp + 1];
    }

    free(stack);
    pthread_mutex_lock(&mtx);
    n_threads -= 1;
    if (n_threads == 0) pthread_cond_signal(&cond);
    pthread_mutex_unlock(&mtx);

    return NULL;
}

void sort(TYPE* data, int len) {

    int n_cpus = sysconf(_SC_NPROCESSORS_ONLN);
    // printf("CPUs: %d\n", n_cpus);
    if (n_cpus > 0) max_threads = n_cpus * 2;
    else max_threads = 8;

    pthread_t thread;
    TYPE** stack = malloc(64 * sizeof(TYPE*));
    stack[0] = data;
    stack[1] = data + len - 1;
    n_threads = 1;
    pthread_create(&thread, NULL, sort_thr, stack);

    pthread_mutex_lock(&mtx);
    while (n_threads != 0) pthread_cond_wait(&cond, &mtx);
    pthread_mutex_unlock(&mtx);
}

// ---------------------- sorting network ----------------------------

#define sort2(a, b) do {  \
    TYPE x = a; \
    TYPE y = b; \
    unsigned m = IS_LOWER(x, y); \
    a = m ? x : y;  \
    b = m ? y : x;  \
} while(0)

// https://bertdobbelaere.github.io/sorting_networks.html

#define sort3(a,b,c) do { \
    sort2(a,c);\
    sort2(a,b);\
    sort2(b,c);\
} while(0)

#define sort4(a,b,c,d) do { \
    sort2(a,c);sort2(b,d);\
    sort2(a,b);sort2(c,d);\
    sort2(b,c);\
} while(0)

#define sort5(a,b,c,d,e) do { \
    sort2(a,d);sort2(b,e);\
    sort2(a,c);sort2(b,d);\
    sort2(a,b);sort2(c,e);\
    sort2(b,c);sort2(d,e);\
    sort2(c,d);\
} while(0)

#define sort6(a,b,c,d,e,f) do { \
    sort2(a,f);sort2(b,d);sort2(c,e);\
    sort2(b,c);sort2(d,e);\
    sort2(a,d);sort2(c,f);\
    sort2(a,b);sort2(c,d);sort2(e,f);\
    sort2(b,c);sort2(d,e);\
} while(0)

#define sort7(a,b,c,d,e,f,g) do { \
    sort2(a,g);sort2(c,d);sort2(e,f);\
    sort2(a,c);sort2(b,e);sort2(d,g);\
    sort2(a,b);sort2(c,f);sort2(d,e);\
    sort2(b,c);sort2(e,g);\
    sort2(c,d);sort2(e,f);\
    sort2(b,c);sort2(d,e);sort2(f,g);\
} while(0)

#define sort8(a,b,c,d,e,f,g,h) do { \
    sort2(a,c);sort2(b,d);sort2(e,g);sort2(f,h);\
    sort2(a,e);sort2(b,f);sort2(c,g);sort2(d,h);\
    sort2(a,b);sort2(c,d);sort2(e,f);sort2(g,h);\
    sort2(c,e);sort2(d,f);\
    sort2(b,e);sort2(d,g);\
    sort2(b,c);sort2(d,e);sort2(f,g);\
} while (0)

#define sort9(a,b,c,d,e,f,g,h,i) do { \
    sort2(a,d);sort2(b,h);sort2(c,f);sort2(e,i);\
    sort2(a,h);sort2(c,e);sort2(d,i);sort2(f,g);\
    sort2(a,c);sort2(b,d);sort2(e,f);sort2(h,i);\
    sort2(b,e);sort2(d,g);sort2(f,h);\
    sort2(a,b);sort2(c,e);sort2(d,f);sort2(g,i);\
    sort2(c,d);sort2(e,f);sort2(g,h);\
    sort2(b,c);sort2(d,e);sort2(f,g);\
} while (0)

#define sort10(a,b,c,d,e,f,g,h,i,j) do { \
    sort2(a,i);sort2(b,j);sort2(c,h);sort2(d,f);sort2(e,g);\
    sort2(a,c);sort2(b,e);sort2(f,i);sort2(h,j);\
    sort2(a,d);sort2(c,e);sort2(f,h);sort2(g,j);\
    sort2(a,b);sort2(d,g);sort2(i,j);\
    sort2(b,f);sort2(c,d);sort2(e,i);sort2(g,h);\
    sort2(b,c);sort2(d,f);sort2(e,g);sort2(h,i);\
    sort2(c,d);sort2(e,f);sort2(g,h);\
    sort2(d,e);sort2(f,g);\
} while(0)

#define sort11(a,b,c,d,e,f,g,h,i,j,k) do { \
    sort2(a,j);sort2(b,g);sort2(c,e);sort2(d,h);sort2(f,i);\
    sort2(a,b);sort2(d,f);sort2(e,k);sort2(g,j);sort2(h,i);\
    sort2(b,d);sort2(c,f);sort2(e,h);sort2(i,k);\
    sort2(a,e);sort2(b,c);sort2(d,h);sort2(f,j);sort2(g,i);\
    sort2(a,b);sort2(c,g);sort2(e,f);sort2(h,i);sort2(j,k);\
    sort2(c,e);sort2(d,g);sort2(f,h);sort2(i,j);\
    sort2(b,c);sort2(d,e);sort2(f,g);sort2(h,i);\
    sort2(c,d);sort2(e,f);sort2(g,h);\
} while (0)

#define sort12(a,b,c,d,e,f,g,h,i,j,k,l) do { \
    sort2(a,i);sort2(b,h);sort2(c,g);sort2(d,l);sort2(e,k);sort2(f,j);\
    sort2(a,b);sort2(c,f);sort2(d,e);sort2(g,j);sort2(h,i);sort2(k,l);\
    sort2(a,c);sort2(b,g);sort2(f,k);sort2(j,l);\
    sort2(a,d);sort2(b,c);sort2(e,g);sort2(f,h);sort2(i,l);sort2(j,k);\
    sort2(b,e);sort2(d,f);sort2(g,i);sort2(h,k);\
    sort2(b,d);sort2(c,f);sort2(g,j);sort2(i,k);\
    sort2(c,d);sort2(e,f);sort2(g,h);sort2(i,j);\
    sort2(e,g);sort2(f,h);\
    sort2(d,e);sort2(f,g);sort2(h,i);\
} while (0)

void sorting_network(TYPE* l, int partsz_min_1) {
    // partsz_min_1 == right - left
    switch (partsz_min_1) {
    case 11:
        sort12(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11]);
        break;
    case 10:
        sort11(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8], l[9], l[10]);
        break;
    case 9:
        sort10(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8], l[9]);
        break;
    case 8:
        sort9(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8]);
        break;
    case 7:
        sort8(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7]);
        break;
    case 6:
        sort7(l[0], l[1], l[2], l[3], l[4], l[5], l[6]);
        break;
    case 5:
        sort6(l[0], l[1], l[2], l[3], l[4], l[5]);
        break;
    case 4:
        sort5(l[0], l[1], l[2], l[3], l[4]);
        break;
    case 3:
        sort4(l[0], l[1], l[2], l[3]);
        break;
    case 2:
        sort3(l[0], l[1], l[2]);
        break;
    case 1:
        sort2(l[0], l[1]);
        break;
    case 0:
        break;
    }
}
// ---------------------- partition ----------------------------

#define med5(a, b, c, d, e) do { \
    sort2(a, b); sort2(c, d); sort2(a, c); \
    sort2(b, d); sort2(b, c); sort2(c, e); \
    sort2(b, c); \
} while(0)

TYPE* partition_small(TYPE* restrict  left, TYPE* restrict  right) {
    TYPE* outerleft = left;
    TYPE* pivp = left + (right - left) / 2;

    med5(left[1], left[2], *pivp, right[-1], *right);
    left += 3;
    right -= 2;

    TYPE piv = *pivp;
    *pivp = *outerleft;

    TYPE swbuf[SMALLPART];
    TYPE* sw = swbuf;
    TYPE* lwr = left;
    while (left <= right) {
        unsigned h = IS_LOWER(*left, piv); *lwr = *sw = *left++; lwr += h; sw += !h;
    }
    memcpy(lwr, swbuf, (sw - swbuf) * sizeof(TYPE));
    lwr -= 1;
    *outerleft = *lwr;
    *lwr = piv;
    return lwr;
}

#define SWSZ 2048
#define UNROLL 16

TYPE* partition(TYPE* restrict left, TYPE* restrict right) {

    TYPE* outerleft = left;
    TYPE* pivp = left + (right - left) / 2;

    med5(left[3], left[4], left[1], left[5], left[6]);
    med5(left[11], left[12], left[2], left[13], left[14]);
    med5(pivp[-20], pivp[-10], pivp[0], pivp[10], pivp[20]);
    med5(right[-6], right[-7], right[-1], right[-8], right[-9]);
    med5(right[-15], right[-14], right[0], right[-13], right[-12]);
    med5(left[1], left[2], pivp[0], right[-1], right[0]);
    left += 3;
    right -= 2;

    TYPE piv = *pivp;
    *pivp = *outerleft;

    TYPE swbuf[SWSZ];

    TYPE* lwr = left;
    TYPE* rwr = right;

    TYPE* sw = swbuf;
    while (sw < swbuf + SWSZ - UNROLL && left <= right - UNROLL) {
        for (int i = UNROLL; i--;) {
            unsigned h = IS_LOWER(*right, piv); *rwr = *sw = *right--; rwr -= !h; sw += h;
        }
    }
    while (sw < swbuf + SWSZ - UNROLL && left <= right) {
        unsigned h = IS_LOWER(*right, piv); *rwr = *sw = *right--; rwr -= !h; sw += h;
    }
    while (left <= right - UNROLL) {
        while (rwr > right + UNROLL && left <= right - UNROLL) {
            for (int i = UNROLL; i--;) {
                unsigned h = IS_LOWER(*left, piv); *lwr = *rwr = *left++; lwr += h; rwr -= !h;
            }
        }
        while (lwr < left - UNROLL && left <= right - UNROLL) {
            for (int i = UNROLL; i--;) {
                unsigned h = IS_LOWER(*right, piv); *rwr = *lwr = *right--; rwr -= !h; lwr += h;
            }
        }
    }
    while (rwr > right && left <= right) {
        unsigned h = IS_LOWER(*left, piv); *lwr = *rwr = *left++; lwr += h; rwr -= !h;
    }
    while (left <= right) {
        unsigned h = IS_LOWER(*right, piv); *rwr = *lwr = *right--; rwr -= !h; lwr += h;
    }
    memcpy(lwr, swbuf, (sw - swbuf) * sizeof(TYPE));

    *outerleft = *rwr;
    *rwr = piv;
    return rwr;
}

// ---------------------- heap_sort, testing and main -----------------------------------

void heap_sort(TYPE* left, TYPE* right) {

    long n = right - left + 1;
    if (n < 2) return;
    for (long i = n / 2, j; ; ) {
        TYPE k;
        if (i > 0) {
            k = left[--i];
        }
        else {
            n -= 1;
            if (n == 0) return;
            k = left[n];
            left[n] = left[0];
        }
        j = i;
        while (j * 2 + 1 < n) {
            long child = j * 2 + 1;
            if (child + 1 < n && IS_LOWER(left[child], left[child + 1])) child++;
            if (!IS_LOWER(k, left[child])) break;
            left[j] = left[child];
            j = child;
        }
        left[j] = k;
    }
}

unsigned chksum;
unsigned hash_el(void *p, int sz) {
    unsigned char* c = (unsigned char*)p;
    unsigned h = 0;
    for (int i = 0; i < sz; i++) h = (h * 131) + c[i];
    return h;
}
void init(TYPE* data, int len) {
    chksum = 0;
    for (int i = 0; i < len; i++) {
        data[i] = rand();
        chksum += hash_el(&data[i], sizeof(TYPE));
    }
}
void test(TYPE* data, int len) {
    unsigned chks = hash_el(&data[0], sizeof(TYPE));;
    for (int i = 1; i < len; i++) {
        if (IS_LOWER(data[i], data[i - 1])) {
            printf("ERROR ORDER\n");
            return;
        }
        chks += hash_el(&data[i], sizeof(TYPE));
    }
    if (chks != chksum) printf("ERROR CHKS\n");
}
double t(void) {

    static double twr;
    struct timeval tv;
    gettimeofday(&tv, NULL);
    double h = twr;
    twr = tv.tv_sec + tv.tv_usec / 1000000.0;
    return twr - h;
}

#define SIZE (50 * 1000000)
TYPE data[SIZE];

int main(void) {

    srand(time(NULL));
    init(data, SIZE);
    printf(NAME"\n");
    printf("Sorting %d million integers ...\n", SIZE / 1000000);
    t();
    sort(data, SIZE);
    printf("%.2fs\n", t());
    test(data, SIZE);
    return 0;
}
The measurements were taken on an Apple M1 system with the -O3 compiler option.

Sorting 50 million numbers with Threaded Branch-Avoidant Quicksort ...
0.23s
Multithreading and the avoidance of branch mispredictions can significantly improve the application performance of programs.


Interactive sorting demo


christof.kaser@gmail.com