A Fast Quicksort in C with Threads and Branch‑Avoidant Coding

When ‘if’ slows you down, avoid it

Branchless Quicksort

Branch-Avoidant Multi-Threaded Quicksort

New threads are created only for large partitions and are limited so that we don’t start too many threads. Smaller partitions are processed in the current thread.

// SPDX-License-Identifier: MIT
// (c) 2026 christof.kaser@gmail.com
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>
#include <pthread.h>
#include <unistd.h>
#include <stdatomic.h>

#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int
#define swap(a, b) do { __typeof__(a) h = a; a = b; b = h; } while (0)

#define NAME "Threaded Branchless Quicksort"

TYPE* partition(TYPE* left, TYPE* right);
void sorting_network(TYPE* left, int size);
void heap_sort(TYPE* left, TYPE* right);
TYPE* partition_small(TYPE* left, TYPE* right);

#define SMALLPART 3072

int max_threads;
atomic_int n_threads;

pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

void* sort_thr(void *arg) {

    TYPE** stack = (TYPE**)arg;
    int sp = 0;
    TYPE* left = stack[sp];
    TYPE* right = stack[sp + 1];

    while (1) {
        int partsz = right - left;
        if (partsz <= 15) {
            sorting_network(left, partsz);
            goto pop_stack;
        }
        TYPE* l;
        TYPE* r;

        TYPE* mid;
        if (partsz <= SMALLPART) mid = partition_small(left, right);
        else mid = partition(left, right);

        if (partsz > SMALLPART && mid - left < 100) {
            heap_sort(left, right);
            goto pop_stack;
        }

        if (mid < left + partsz / 2) {
            l = mid + 1;
            r = right;
            right = mid - 1;
        }
        else {
            l = left;
            r = mid - 1;
            left = mid + 1;
        }
        // soft limit: may exceed max_threads
        TYPE** stack_new = NULL;
        if (r - l > 1000000 && n_threads < max_threads) {
            // start a new thread - max_threads is a soft limit
            stack_new = malloc(64 * sizeof(TYPE*));
            // stack size 64 is sufficient for n <= 2^32
            if (stack_new) {
                stack_new[0] = l;
                stack_new[1] = r;

                pthread_mutex_lock(&mtx);
                pthread_t thread;
                if (pthread_create(&thread, NULL, sort_thr, stack_new) == 0) {
                    n_threads += 1;
                    pthread_detach(thread);
                }
                else {
                    free(stack_new);
                    stack_new = NULL;
                }
                pthread_mutex_unlock(&mtx);
            }
        }
        if (stack_new == NULL) {
            stack[sp] = l;
            stack[sp + 1] = r;
            sp += 2;
        }
        continue;

    pop_stack:
        if (sp == 0) break;
        sp -= 2;
        left = stack[sp];
        right = stack[sp + 1];
    }

    free(stack);
    pthread_mutex_lock(&mtx);
    n_threads -= 1;
    if (n_threads == 0) pthread_cond_signal(&cond);
    pthread_mutex_unlock(&mtx);

    return NULL;
}

void sort(TYPE* data, int len) {

    int n_cpus = sysconf(_SC_NPROCESSORS_ONLN);
    // printf("CPUs: %d\n", n_cpus);
    if (n_cpus > 0) max_threads = n_cpus * 2;
    else max_threads = 8;

    for (TYPE* p = data + 1; p < data + len; p++) {
        if (*p < *data) swap(*p, *data);
    }
    pthread_t thread;
    TYPE** stack = malloc(64 * sizeof(TYPE*));
    stack[0] = data + 1;
    stack[1] = data + len - 1;
    n_threads = 1;
    pthread_create(&thread, NULL, sort_thr, stack);

    pthread_mutex_lock(&mtx);
    while (n_threads != 0) pthread_cond_wait(&cond, &mtx);
    pthread_mutex_unlock(&mtx);
}

// ---------------------- sorting network ----------------------------

#define sort2(a, b) do {  \
    __typeof__(a) x = a; \
    __typeof__(a) y = b; \
    unsigned m = IS_LOWER(x, y); \
    a = m ? x : y;  \
    b = m ? y : x;  \
} while(0)

// https://bertdobbelaere.github.io/sorting_networks.html

#define sort3(a, b, c) do { \
    sort2(a, b); sort2(b, c); sort2(a, b); \
} while(0)

#define sort4(a, b, c, d) do { \
    sort2(a, b); sort2(c, d); sort2(a, c); \
    sort2(b, d); sort2(b, c); \
} while(0)

#define sort5(a, b, c, d, e) do { \
    sort2(b, c); sort2(d, e); sort2(b, d); \
    sort2(a, c); sort2(a, d); sort2(c, e); \
    sort2(a, b); sort2(c, d); sort2(b, c); \
} while(0)

#define sort6(a, b, c, d, e, f) do { \
    sort2(a, b); sort2(c, d); sort2(e, f); \
    sort2(a, c); sort2(b, d); sort2(e, f); \
    sort2(a, e); sort2(b, f); sort2(c, e); \
    sort2(d, f); sort2(b, c); sort2(d, e); \
    sort2(c, d); \
} while(0)

#define sort7(a, b, c, d, e, f, g) do { \
    sort2(a, b); sort2(c, d); sort2(a, c); \
    sort2(b, d); sort2(b, c); sort2(e, f); \
    sort2(e, g); sort2(f, g); sort2(a, e); \
    sort2(b, f); sort2(c, g); sort2(b, e); \
    sort2(d, g); sort2(c, e); sort2(b, c); \
    sort2(d, f); sort2(d, e); \
} while(0)

#define sort8(a,b,c,d,e,f,g,h) do { \
    sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); \
    sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); \
    sort2(b,c); sort2(f,g); \
    sort2(a,e); sort2(b,f); sort2(c,g); sort2(d,h); \
    sort2(c,e); sort2(d,f); \
    sort2(b,c); sort2(d,e); sort2(f,g); \
} while (0)

#define sort9(a,b,c,d,e,f,g,h,i) do { \
    sort2(a,d); sort2(b,h); sort2(c,f); sort2(e,i); \
    sort2(a,h); sort2(c,e); sort2(d,i); sort2(f,g); \
    sort2(a,c); sort2(b,d); sort2(e,f); sort2(h,i); \
    sort2(b,e); sort2(d,g); sort2(f,h); \
    sort2(a,b); sort2(c,e); sort2(d,f); sort2(g,i); \
    sort2(c,d); sort2(e,f); sort2(g,h); \
    sort2(b,c); sort2(d,e); sort2(f,g); \
} while (0)

#define sort10(a,b,c,d,e,f,g,h,i,j) do { \
    sort2(a,i); sort2(b,j); sort2(c,h); sort2(d,f); sort2(e,g); \
    sort2(a,c); sort2(b,e); sort2(f,i); sort2(h,j); \
    sort2(a,d); sort2(c,e); sort2(f,h); sort2(g,j); \
    sort2(a,b); sort2(d,g); sort2(i,j); \
    sort2(b,f); sort2(c,d); sort2(e,i); sort2(g,h); \
    sort2(b,c); sort2(d,f); sort2(e,g); sort2(h,i); \
    sort2(c,d); sort2(e,f); sort2(g,h); \
    sort2(d,e); sort2(f,g); \
} while (0)

#define sort11(a,b,c,d,e,f,g,h,i,j,k) do { \
    sort2(a,j); sort2(b,g); sort2(c,e); sort2(d,h); sort2(f,i); \
    sort2(a,b); sort2(d,f); sort2(e,k); sort2(g,j); sort2(h,i); \
    sort2(b,d); sort2(c,f); sort2(e,h); sort2(i,k); \
    sort2(a,e); sort2(b,c); sort2(d,h); sort2(f,j); sort2(g,i); \
    sort2(a,b); sort2(c,g); sort2(e,f); sort2(h,i); sort2(j,k); \
    sort2(c,e); sort2(d,g); sort2(f,h); sort2(i,j); \
    sort2(b,c); sort2(d,e); sort2(f,g); sort2(h,i); \
    sort2(c,d); sort2(e,f); sort2(g,h); \
} while (0)

#define sort12(a,b,c,d,e,f,g,h,i,j,k,l) do { \
    sort2(a,i); sort2(b,h); sort2(c,g); sort2(d,l); sort2(e,k); sort2(f,j); \
    sort2(a,c); sort2(b,e); sort2(d,f); sort2(g,i); sort2(h,k); sort2(j,l); \
    sort2(a,b); sort2(c,j); sort2(e,h); sort2(f,g); sort2(k,l); \
    sort2(b,d); sort2(c,h); sort2(e,j); sort2(i,k); \
    sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
    sort2(b,c); sort2(d,f); sort2(g,i); sort2(j,k); \
    sort2(c,e); sort2(d,g); sort2(f,i); sort2(h,j); \
    sort2(b,c); sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
} while (0)

#define sort13(a,b,c,d,e,f,g,h,i,j,k,l,m) do { \
    sort2(a,l); sort2(b,h); sort2(c,e); sort2(d,f); sort2(i,j); sort2(k,m); \
    sort2(a,c); sort2(d,g); sort2(e,m); sort2(f,h); sort2(i,k); \
    sort2(a,i); sort2(b,d); sort2(c,f); sort2(e,j); sort2(g,l); sort2(h,m); \
    sort2(a,b); sort2(c,k); sort2(d,i); sort2(e,g); sort2(j,l); \
    sort2(b,d); sort2(c,e); sort2(f,k); sort2(g,i); sort2(h,j); sort2(l,m); \
    sort2(b,c); sort2(d,e); sort2(f,i); sort2(g,j); sort2(h,k); \
    sort2(c,d); sort2(e,h); sort2(f,g); sort2(i,l); sort2(j,k); \
    sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
    sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
} while (0)

#define sort14(a,b,c,d,e,f,g,h,i,j,k,l,m,n) do { \
    sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); \
    sort2(i,j); sort2(k,l); sort2(m,n); \
    sort2(a,c); sort2(b,d); sort2(e,i); sort2(f,j); \
    sort2(k,m); sort2(l,n); \
    sort2(a,k); sort2(b,g); sort2(c,l); sort2(d,n); \
    sort2(f,i); sort2(h,m); \
    sort2(b,e); sort2(c,i); sort2(d,g); \
    sort2(f,l); sort2(h,k); sort2(j,m); \
    sort2(a,b); sort2(d,j); sort2(e,k); \
    sort2(f,h); sort2(g,i); sort2(m,n); \
    sort2(b,f); sort2(c,e); sort2(d,h); \
    sort2(g,k); sort2(i,m); sort2(j,l); \
    sort2(b,c); sort2(d,f); sort2(e,g); \
    sort2(h,j); sort2(i,k); sort2(l,m); \
    sort2(c,d); sort2(e,f); sort2(g,h); \
    sort2(i,j); sort2(k,l); \
    sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
} while (0)

#define sort15(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o) do { \
    sort2(a,g); sort2(b,k); sort2(c,o); sort2(d,j); \
    sort2(e,m); sort2(f,n); sort2(h,l); \
    sort2(a,h); sort2(c,f); sort2(d,e); \
    sort2(g,l); sort2(i,k); sort2(j,m); sort2(n,o); \
    sort2(b,n); sort2(c,d); sort2(e,g); \
    sort2(f,j); sort2(h,i); sort2(k,o); sort2(l,m); \
    sort2(a,d); sort2(b,e); sort2(f,h); \
    sort2(g,n); sort2(i,j); sort2(k,l); sort2(m,o); \
    sort2(a,c); sort2(b,f); sort2(d,i); \
    sort2(e,g); sort2(h,k); sort2(j,l); sort2(m,n); \
    sort2(a,b); sort2(c,f); sort2(d,k); \
    sort2(e,i); sort2(g,h); sort2(j,m); sort2(l,n); \
    sort2(b,c); sort2(d,e); sort2(f,g); \
    sort2(h,j); sort2(i,k); sort2(l,m); \
    sort2(d,f); sort2(e,g); sort2(h,i); sort2(j,k); \
    sort2(c,d); sort2(e,f); sort2(g,h); \
    sort2(i,j); sort2(k,l); \
} while (0)

#define sort16(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) do { \
    sort2(a,f); sort2(b,e); sort2(c,m); sort2(d,n); \
    sort2(g,h); sort2(i,j); sort2(k,p); sort2(l,o); \
    sort2(a,c); sort2(b,k); sort2(d,g); sort2(e,h); \
    sort2(f,o); sort2(i,l); sort2(j,m); sort2(n,p); \
    sort2(a,i); sort2(b,d); sort2(c,l); sort2(e,n); \
    sort2(f,j); sort2(g,k); sort2(h,p); sort2(m,o); \
    sort2(a,b); sort2(c,e); sort2(d,i); sort2(f,g); \
    sort2(h,m); sort2(j,k); sort2(l,n); sort2(o,p); \
    sort2(b,d); sort2(c,f); sort2(e,i); sort2(g,j); \
    sort2(h,l); sort2(k,n); sort2(m,o); \
    sort2(b,c); sort2(d,f); sort2(e,l); sort2(g,i); \
    sort2(h,j); sort2(k,m); sort2(n,o); \
    sort2(c,d); sort2(e,f); sort2(g,h); sort2(i,j); \
    sort2(k,l); sort2(m,n); \
    sort2(e,g); sort2(f,h); sort2(i,k); sort2(j,l); \
    sort2(d,e); sort2(f,g); sort2(h,i); \
    sort2(j,k); sort2(l,m); \
} while (0)

#define sort17(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q) do { \
    sort2(b,c); sort2(d,e); sort2(f,g); sort2(h,i); \
    sort2(j,k); sort2(l,m); sort2(n,o); sort2(p,q); \
    sort2(b,d); sort2(c,e); sort2(f,h); sort2(g,i); \
    sort2(j,l); sort2(k,m); sort2(n,p); sort2(o,q); \
    sort2(b,f); sort2(c,g); sort2(d,h); sort2(e,i); \
    sort2(j,n); sort2(k,o); sort2(l,p); sort2(m,q); \
    sort2(a,d); sort2(b,n); sort2(c,k); sort2(e,h); \
    sort2(f,l); sort2(g,m); sort2(i,j); sort2(o,p); \
    sort2(a,n); sort2(b,i); sort2(c,f); sort2(d,g); \
    sort2(e,o); sort2(h,p); sort2(j,q); sort2(k,l); \
    sort2(a,b); sort2(c,i); sort2(d,e); sort2(f,k); \
    sort2(g,n); sort2(h,l); sort2(m,o); \
    sort2(b,f); sort2(d,i); sort2(e,k); sort2(g,h); \
    sort2(j,m); sort2(l,n); \
    sort2(b,c); sort2(e,g); sort2(f,i); sort2(h,k); \
    sort2(j,l); sort2(m,o); sort2(n,p); \
    sort2(c,d); sort2(e,f); sort2(g,i); sort2(h,j); \
    sort2(k,l); sort2(m,n); sort2(o,p); \
    sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
    sort2(l,m); sort2(n,o); sort2(p,q); \
} while (0)

#define sort18(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r) do { \
    sort2(a,g); sort2(b,k); sort2(c,p); sort2(d,f); sort2(e,j); \
    sort2(h,q); sort2(i,n); sort2(l,r); sort2(m,o); \
    sort2(a,m); sort2(b,e); sort2(d,l); sort2(f,r); sort2(g,o); \
    sort2(h,i); sort2(j,k); sort2(n,q); \
    sort2(b,n); sort2(c,h); sort2(e,q); sort2(g,j); \
    sort2(i,l); sort2(k,p); \
    sort2(a,b); sort2(c,d); sort2(e,m); sort2(f,n); \
    sort2(h,j); sort2(i,k); sort2(o,p); sort2(q,r); \
    sort2(a,c); sort2(b,l); sort2(d,e); sort2(f,h); \
    sort2(g,q); sort2(k,m); sort2(n,o); sort2(p,r); \
    sort2(b,i); sort2(e,k); sort2(f,g); sort2(h,n); \
    sort2(j,q); sort2(l,m); \
    sort2(b,d); sort2(c,f); sort2(e,h); sort2(g,i); \
    sort2(j,l); sort2(k,n); sort2(m,p); sort2(o,q); \
    sort2(b,c); sort2(d,f); sort2(e,g); sort2(h,j); \
    sort2(i,k); sort2(l,n); sort2(m,o); sort2(p,q); \
    sort2(c,d); sort2(f,i); sort2(g,h); sort2(j,m); \
    sort2(k,l); sort2(o,p); \
    sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
    sort2(l,m); sort2(n,o); \
    sort2(e,f); sort2(g,h); sort2(i,j); \
    sort2(k,l); sort2(m,n); \
} while (0)
#define sort19(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s) do { \
    sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); \
    sort2(i,k); sort2(l,m); sort2(n,o); sort2(p,q); sort2(r,s); \
    sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); \
    sort2(i,j); sort2(l,n); sort2(m,o); sort2(p,r); sort2(q,s); \
    sort2(a,e); sort2(b,f); sort2(c,g); sort2(d,h); \
    sort2(j,k); sort2(l,p); sort2(m,q); sort2(n,r); sort2(o,s); \
    sort2(a,l); sort2(b,i); sort2(c,n); sort2(d,r); \
    sort2(e,k); sort2(f,g); sort2(j,q); sort2(m,p); \
    sort2(b,c); sort2(d,n); sort2(e,m); sort2(f,o); \
    sort2(g,q); sort2(h,k); sort2(i,p); \
    sort2(a,b); sort2(c,l); sort2(d,j); sort2(f,m); \
    sort2(g,p); sort2(h,n); sort2(k,s); sort2(o,r); \
    sort2(b,e); sort2(d,i); sort2(f,l); sort2(g,j); \
    sort2(h,m); sort2(k,n); sort2(o,p); sort2(q,r); \
    sort2(c,e); sort2(d,f); sort2(g,h); \
    sort2(i,l); sort2(j,m); sort2(k,o); sort2(n,p); \
    sort2(c,d); sort2(e,f); sort2(g,i); sort2(h,j); \
    sort2(k,l); sort2(m,o); sort2(n,q); sort2(p,r); \
    sort2(b,c); sort2(e,g); sort2(f,i); sort2(h,k); \
    sort2(j,l); sort2(m,n); sort2(o,q); \
    sort2(d,e); sort2(f,g); sort2(h,i); \
    sort2(j,k); sort2(l,m); sort2(n,o); sort2(p,q); \
} while (0)

#define sort20(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t) do { \
    sort2(a,m); sort2(b,n); sort2(c,o); sort2(d,p); sort2(e,q); \
    sort2(f,r); sort2(g,s); sort2(h,t); sort2(i,k); sort2(j,l); \
    sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); \
    sort2(i,j); sort2(k,l); sort2(m,o); sort2(n,p); sort2(q,s); sort2(r,t); \
    sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); \
    sort2(m,n); sort2(o,p); sort2(q,r); sort2(s,t); \
    sort2(a,e); sort2(b,m); sort2(c,q); sort2(d,r); \
    sort2(f,i); sort2(g,j); sort2(h,s); \
    sort2(k,n); sort2(l,o); sort2(p,t); \
    sort2(b,g); sort2(d,k); sort2(e,f); sort2(h,l); \
    sort2(i,m); sort2(j,q); sort2(n,s); sort2(o,p); \
    sort2(a,e); sort2(c,i); sort2(d,j); sort2(g,h); \
    sort2(k,q); sort2(l,r); sort2(m,n); sort2(p,t); \
    sort2(b,e); sort2(d,g); sort2(f,i); sort2(h,k); \
    sort2(j,m); sort2(l,o); sort2(n,q); sort2(p,s); \
    sort2(c,d); sort2(e,f); sort2(g,i); sort2(h,j); \
    sort2(k,m); sort2(l,n); sort2(o,p); sort2(q,r); \
    sort2(c,e); sort2(d,g); sort2(f,h); sort2(i,k); \
    sort2(j,l); sort2(m,o); sort2(n,q); sort2(p,r); \
    sort2(b,c); sort2(d,f); sort2(g,h); sort2(i,j); \
    sort2(k,l); sort2(m,n); sort2(o,q); sort2(r,s); \
    sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
    sort2(l,m); sort2(n,o); sort2(p,q); \
} while (0)

#define sort21(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u) do { \
    sort2(a,h); sort2(b,k); sort2(d,f); sort2(e,i); sort2(g,n); \
    sort2(j,t); sort2(l,o); sort2(m,r); sort2(p,q); sort2(s,u); \
    sort2(a,l); sort2(b,p); sort2(c,m); sort2(d,e); sort2(f,i); \
    sort2(g,j); sort2(h,o); sort2(k,q); sort2(n,t); sort2(r,u); \
    sort2(a,g); sort2(b,d); sort2(c,s); sort2(e,p); sort2(f,k); \
    sort2(i,q); sort2(l,r); sort2(m,n); sort2(o,u); \
    sort2(c,g); sort2(f,m); sort2(h,s); sort2(i,o); \
    sort2(j,l); sort2(k,r); sort2(n,t); sort2(q,u); \
    sort2(b,c); sort2(e,h); sort2(f,j); sort2(g,r); \
    sort2(k,n); sort2(l,m); sort2(o,t); sort2(p,s); \
    sort2(a,c); sort2(d,g); sort2(e,f); sort2(h,k); \
    sort2(i,l); sort2(j,p); sort2(m,q); sort2(n,s); \
    sort2(o,r); sort2(t,u); \
    sort2(a,b); sort2(c,d); sort2(f,j); sort2(g,m); \
    sort2(h,i); sort2(l,o); sort2(n,p); sort2(q,t); sort2(r,s); \
    sort2(b,c); sort2(d,j); sort2(g,n); sort2(k,l); \
    sort2(m,p); sort2(q,r); sort2(s,t); \
    sort2(b,e); sort2(c,f); sort2(d,h); sort2(g,k); \
    sort2(i,j); sort2(l,m); sort2(n,o); sort2(r,s); \
    sort2(c,e); sort2(f,g); sort2(h,i); sort2(j,l); \
    sort2(k,n); sort2(m,p); sort2(o,q); \
    sort2(d,e); sort2(f,h); sort2(g,i); sort2(j,k); \
    sort2(l,n); sort2(m,o); sort2(p,q); \
    sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
    sort2(m,n); sort2(o,p); sort2(q,r); \
} while (0)
#define sort22(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v) do { \
    sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
    sort2(m,n); sort2(o,p); sort2(q,r); sort2(s,t); sort2(u,v); \
    sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); sort2(i,k); sort2(l,n); \
    sort2(o,q); sort2(p,r); sort2(s,u); sort2(t,v); \
    sort2(a,e); sort2(b,f); sort2(c,g); sort2(d,h); sort2(i,m); sort2(j,n); \
    sort2(o,s); sort2(p,t); sort2(q,u); sort2(r,v); \
    sort2(a,o); sort2(b,p); sort2(c,s); sort2(d,t); sort2(e,q); sort2(f,r); \
    sort2(g,u); sort2(h,v); sort2(j,l); sort2(k,m); \
    sort2(a,i); sort2(c,k); sort2(e,o); sort2(f,m); sort2(g,p); sort2(h,r); \
    sort2(j,q); sort2(l,t); sort2(n,v); \
    sort2(b,j); sort2(c,e); sort2(d,q); sort2(f,s); sort2(g,k); sort2(h,n); \
    sort2(i,o); sort2(l,p); sort2(m,u); sort2(r,t); \
    sort2(b,i); sort2(d,l); sort2(e,f); sort2(h,m); sort2(j,o); sort2(k,s); \
    sort2(n,u); sort2(q,r); \
    sort2(b,c); sort2(d,f); sort2(e,i); sort2(g,j); sort2(h,l); sort2(k,o); \
    sort2(m,p); sort2(n,r); sort2(q,s); sort2(t,u); \
    sort2(c,e); sort2(d,g); sort2(f,j); sort2(h,k); sort2(l,o); sort2(m,q); \
    sort2(p,s); sort2(r,t); \
    sort2(d,e); sort2(f,h); sort2(g,i); sort2(j,l); sort2(k,m); sort2(n,p); \
    sort2(o,q); sort2(r,s); \
    sort2(f,g); sort2(h,i); sort2(j,k); sort2(l,m); sort2(n,o); sort2(p,q); \
    sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); sort2(m,n); sort2(o,p); \
    sort2(q,r); \
} while (0)

#define sort23(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w) do { \
    sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
    sort2(m,n); sort2(o,p); sort2(q,r); sort2(s,t); sort2(u,v); \
    sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); sort2(i,k); sort2(j,l); \
    sort2(m,o); sort2(n,p); sort2(r,t); sort2(s,u); sort2(v,w); \
    sort2(a,e); sort2(b,f); sort2(c,g); sort2(d,h); \
    sort2(i,m); sort2(j,n); sort2(k,o); sort2(l,p); \
    sort2(q,v); sort2(r,w); \
    sort2(b,k); sort2(c,j); sort2(d,l); sort2(g,t); \
    sort2(m,r); sort2(o,w); sort2(q,s); sort2(u,v); \
    sort2(a,q); sort2(b,c); sort2(d,v); sort2(e,r); \
    sort2(f,o); sort2(g,n); sort2(h,w); sort2(j,s); \
    sort2(k,u); sort2(p,t); \
    sort2(b,k); sort2(c,j); sort2(d,r); sort2(e,m); \
    sort2(f,s); sort2(g,u); sort2(h,p); sort2(i,q); \
    sort2(l,o); sort2(n,v); sort2(t,w); \
    sort2(a,i); sort2(b,e); sort2(c,k); sort2(d,j); \
    sort2(f,g); sort2(l,v); sort2(m,q); sort2(n,u); \
    sort2(o,p); sort2(r,s); \
    sort2(c,i); sort2(d,f); sort2(e,m); sort2(g,j); \
    sort2(h,l); sort2(k,q); sort2(n,r); sort2(p,v); \
    sort2(s,u); \
    sort2(b,c); sort2(e,i); sort2(f,k); sort2(g,m); \
    sort2(h,n); sort2(j,q); sort2(l,s); sort2(o,r); \
    sort2(p,t); \
    sort2(c,e); sort2(d,f); sort2(g,i); sort2(h,j); \
    sort2(k,m); sort2(l,n); sort2(o,q); sort2(p,u); \
    sort2(r,s); sort2(t,v); \
    sort2(d,g); sort2(f,i); sort2(h,k); sort2(j,m); \
    sort2(l,o); sort2(n,q); sort2(p,r); sort2(s,u); \
    sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
    sort2(l,m); sort2(n,o); sort2(p,q); sort2(r,s); \
    sort2(t,u); \
} while (0)

#define sort24(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x) do { \
    sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
    sort2(m,n); sort2(o,p); sort2(q,r); sort2(s,t); sort2(u,v); sort2(w,x); \
    sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); sort2(i,k); sort2(j,l); \
    sort2(m,o); sort2(n,p); sort2(q,s); sort2(r,t); sort2(u,w); sort2(v,x); \
    sort2(a,e); sort2(b,f); sort2(c,g); sort2(d,h); \
    sort2(i,m); sort2(j,n); sort2(k,o); sort2(l,p); \
    sort2(q,u); sort2(r,v); sort2(s,w); sort2(t,x); \
    sort2(a,q); sort2(b,s); sort2(c,r); sort2(d,t); \
    sort2(e,u); sort2(f,w); sort2(g,v); sort2(h,x); \
    sort2(j,k); sort2(n,o); \
    sort2(c,k); sort2(d,l); sort2(f,s); sort2(g,o); \
    sort2(h,p); sort2(i,q); sort2(j,r); sort2(m,u); \
    sort2(n,v); \
    sort2(a,i); sort2(b,j); sort2(c,m); sort2(d,u); \
    sort2(e,q); sort2(f,n); sort2(g,r); sort2(h,t); \
    sort2(k,s); sort2(l,v); sort2(o,w); sort2(p,x); \
    sort2(b,i); sort2(d,q); sort2(e,m); sort2(f,k); \
    sort2(g,j); sort2(h,u); sort2(l,t); sort2(n,s); \
    sort2(o,r); sort2(p,w); \
    sort2(c,e); sort2(d,f); sort2(h,n); sort2(j,m); \
    sort2(k,q); sort2(l,o); sort2(s,u); sort2(t,v); \
    sort2(b,c); sort2(e,i); sort2(f,j); sort2(g,k); \
    sort2(h,l); sort2(m,q); sort2(n,r); sort2(o,s); \
    sort2(p,t); sort2(v,w); \
    sort2(c,e); sort2(d,i); sort2(f,g); sort2(h,j); \
    sort2(k,m); sort2(l,n); sort2(o,q); sort2(p,u); \
    sort2(r,s); sort2(t,v); \
    sort2(d,f); sort2(g,i); sort2(h,k); sort2(j,m); \
    sort2(l,o); sort2(n,q); sort2(p,r); sort2(s,u); \
    sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
    sort2(l,m); sort2(n,o); sort2(p,q); sort2(r,s); sort2(t,u); \
} while (0)

void sorting_network(TYPE* l, int partsz_min_1) {
    switch (partsz_min_1) {
    case 23:
        sort24(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
            l[16], l[17], l[18], l[19], l[20], l[21], l[22], l[23]);
        break;
    case 22:
        sort23(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
            l[16], l[17], l[18], l[19], l[20], l[21], l[22]);
        break;
    case 21:
        sort22(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
            l[16], l[17], l[18], l[19], l[20], l[21]);
        break;
    case 20:
        sort21(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
            l[16], l[17], l[18], l[19], l[20]);
        break;
    case 19:
        sort20(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
            l[16], l[17], l[18], l[19]);
        break;
    case 18:
        sort19(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
            l[16], l[17], l[18]);
        break;
    case 17:
        sort18(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15], l[16], l[17]);
        break;
    case 16:
        sort17(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15], l[16]);
        break;
    case 15:
        sort16(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15]);
        break;
    case 14:
        sort15(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14]);
        break;
    case 13:
        sort14(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13]);
        break;
    case 12:
        sort13(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12]);
            break;
    case 11:
        sort12(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11]);
        break;
    case 10:
        sort11(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8], l[9], l[10]);
        break;
    case 9:
        sort10(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8], l[9]);
        break;
    case 8:
        sort9(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8]);
        break;
    case 7:
        sort8(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7]);
        break;
    case 6:
        sort7(l[0], l[1], l[2], l[3], l[4], l[5], l[6]);
        break;
    case 5:
        sort6(l[0], l[1], l[2], l[3], l[4], l[5]);
        break;
    case 4:
        sort5(l[0], l[1], l[2], l[3], l[4]);
        break;
    case 3:
        sort4(l[0], l[1], l[2], l[3]);
        break;
    case 2:
        sort3(l[0], l[1], l[2]);
        break;
    case 1:
        sort2(l[0], l[1]);
        break;
    case 0:
        break;
    }
}
// ---------------------- partition ----------------------------

#define med5(a, b, c, d, e) do { \
    sort2(a, b); sort2(c, d); sort2(a, c); \
    sort2(b, d); sort2(b, c); sort2(c, e); \
    sort2(b, c); \
} while(0)

TYPE* partition_small(TYPE* restrict  left, TYPE* restrict  right) {
    TYPE* outerleft = left;

    TYPE* pivp = left + (right - left) / 2;

    med5(left[1], left[2], *pivp, right[-1], *right);
    left += 3;
    right -= 2;

    TYPE piv = *pivp;
    *pivp = *outerleft;

    TYPE _Alignas(64) swbuf[SMALLPART];
    TYPE* sw = swbuf;
    TYPE* left0 = left;
    while (left <= right) {
        unsigned h = IS_LOWER(*left, piv);
        *left0 = *sw = *left++;
        left0 += h;
        sw += !h;
    }
    memcpy(left0, swbuf, (sw - swbuf) * sizeof(TYPE));
    left0 -= 1;
    *outerleft = *left0;
    *left0 = piv;
    return left0;
}

TYPE* partition(TYPE* restrict left, TYPE* restrict right) {

    TYPE* outerleft = left;
    int n = right - left;
    TYPE* pivp = left + n / 2;
    TYPE piv;

    TYPE* q1 = left + n / 4;
    TYPE* q3 = pivp + n / 4;

    left += 1;
    med5(left[0], left[1], left[2], left[3], left[4]);
    med5(q1[-2], q1[-1], q1[0], q1[1], q1[2]);
    med5(pivp[-2], pivp[-1], pivp[0], pivp[1], pivp[2]);
    med5(q3[-2], q3[-1], q3[0], q3[1], q3[2]);
    med5(right[-4], right[-3], right[-2], right[-1], right[0]);
    med5(left[2], q1[0], pivp[0], q3[0], right[-2]);

    piv = *pivp;
    *pivp = *outerleft;

    #define SWSZ 1024
    TYPE _Alignas(64) swbuf[SWSZ];

    TYPE* left0 = left;
    TYPE* right0 = right;

    TYPE* sw = swbuf;
    #define UNROLL 16
    while (sw < swbuf + SWSZ - UNROLL && left <= right - UNROLL) {
        for (int i = UNROLL; i--;) {
            unsigned h = IS_LOWER(*right, piv); *right0 = *sw = *right--; right0 -= !h; sw += h;
        }
    }
    while (sw < swbuf + SWSZ - UNROLL && left <= right) {
        unsigned h = IS_LOWER(*right, piv); *right0 = *sw = *right--; right0 -= !h; sw += h;
    }
    while (1) {
        while (right0 > right + UNROLL && left <= right - UNROLL) {
            for (int i = UNROLL; i--;) {
                unsigned h = IS_LOWER(*left, piv); *left0 = *right0 = *left++; left0 += h; right0 -= !h;
            }
        }
        while (left0 < left - UNROLL && left <= right - UNROLL) {
            for (int i = UNROLL; i--;) {
                unsigned h = IS_LOWER(*right, piv); *right0 = *left0 = *right--; right0 -= !h; left0 += h;
            }
        }
        if (left > right - UNROLL) break;
    }
    while (right0 > right && left <= right) {
        unsigned h = IS_LOWER(*left, piv); *left0 = *right0 = *left++; left0 += h; right0 -= !h;
    }
    while (left <= right) {
        unsigned h = IS_LOWER(*right, piv); *right0 = *left0 = *right--; right0 -= !h; left0 += h;
    }
    memcpy(left0, swbuf, (sw - swbuf) * sizeof(TYPE));

    *outerleft = *right0;
    *right0 = piv;
    return right0;
}

// ---------------------- heapsort, testing and main -----------------------------------

void heap_sort(TYPE* restrict left, TYPE* restrict right) {

    long n = right - left + 1;
    if (n < 2) return;
    for (long i = n / 2, j, k; ; ) {
        if (i > 0) {
            k = left[--i];
        }
        else {
            n -= 1;
            if (n == 0) return;
            k = left[n];
            left[n] = left[0];
        }
        j = i;
        while (j * 2 + 1 < n) {
            long child = j * 2 + 1;
            if (child + 1 < n && IS_LOWER(left[child], left[child + 1])) child++;
            if (!IS_LOWER(k, left[child])) break;
            left[j] = left[child];
            j = child;
        }
        left[j] = k;
    }
}

unsigned chksum;
unsigned hash_el(void *p, int sz) {
    unsigned char* c = (unsigned char*)p;
    unsigned h = 0;
    for (int i = 0; i < sz; i++) h = (h * 131) + c[i];
    return h;
}
void init(TYPE* data, int len) {
    chksum = 0;
    for (int i = 0; i < len; i++) {
        data[i] = rand();
        chksum += hash_el(&data[i], sizeof(TYPE));
    }
}
void test(TYPE* data, int len) {
    unsigned chks = hash_el(&data[0], sizeof(TYPE));;
    for (int i = 1; i < len; i++) {
        if (data[i] < data[i - 1]) {
            printf("ERROR ORDER\n");
            break;
        }
        chks += hash_el(&data[i], sizeof(TYPE));
    }
    if (chks != chksum) printf("ERROR CHKS\n");
}
double t(void) {

    static double t0;
    struct timeval tv;
    gettimeofday(&tv, NULL);
    double h = t0;
    t0 = tv.tv_sec + tv.tv_usec / 1000000.0;
    return t0 - h;
}

#define SIZE (50 * 1000000)
TYPE data[SIZE];

int main(void) {
    srand(time(NULL));
    init(data, SIZE);
    printf("Sorting %d million numbers " NAME " ...\n", SIZE / 1000000);
    t();
    sort(data, SIZE);
    printf("%.3fs\n", t());
    test(data, SIZE);

    return 0;
}
The measurements were taken on an Apple M1 system with the -O2 compiler option.

Sorting 50 million numbers with Threaded Branchless Quicksort ...
0.236s
Multithreading and the avoidance of branch mispredictions can significantly improve the application performance of programs.


Interactive sorting demo


christof.kaser@gmail.com