A Fast Quicksort in C with Branch‑Avoidant Programming

On modern CPUs, avoiding branch misprediction is an effective way to speed up programs.

When ‘if’ slows you down, avoid it

Quicksort in C - Performance Comparison

Performance results naturally depend on the underlying hardware. The following benchmarks show the execution times for sorting 50 million integers using different sorting implementations. The measurements were taken on an Apple M1 system and on an Intel Xeon (E5-2680) system with the -O3 compiler option.

Implementation Apple M1 Intel Xeon
Baseline Quicksort 3.234s 6.792s
Branch-Avoidant 1.410s 3.228s
std::sort (C++) 1.190s 4.949s
Branch-Avoidant Optimized 0.870s 1.724s

Baseline Quicksort

This is a simple iterative quicksort written in C using a Lomuto‑style partition.

The pivot is placed directly into its final position during partitioning, and the algorithm always continues with the larger part while storing the smaller one on a manual stack.

To avoid the O(n²) runtime caused by many duplicates, the program switches to heap sort for a segment if it detects a significant bias in the partitioning.

// SPDX-License-Identifier: MIT
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>

#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int
#define swap(a, b) do { __typeof__(a) _h = a; a = b; b = _h; } while (0)

#define NAME "Baseline Quicksort"

TYPE* partition(TYPE* left, TYPE* right);
void heap_sort(TYPE* left, TYPE* right);

void sort(TYPE* data, int len) {

    TYPE* left = data;
    TYPE* stack[64];    // for max 2^32 items
    unsigned sp = 0;
    TYPE* right = data + len - 1;

    while (1) {
        long partsz = right - left;
        if (partsz <= 0) goto pop_stack;

        TYPE* mid = partition(left, right);

        if ((mid - left) * 16 < partsz) {
            heap_sort(left, right);
            goto pop_stack;
        }
        if (mid < left + partsz / 2) {
            stack[sp] = mid + 1;
            stack[sp + 1] = right;
            right = mid - 1;
        }
        else {
            stack[sp] = left;
            stack[sp + 1] = mid - 1;
            left = mid + 1;
        }
        sp += 2;
        continue;

    pop_stack:
        if (sp == 0) break;
        sp -= 2;
        left = stack[sp];
        right = stack[sp + 1];
    }
}

// ---------------------- partition ----------------------------

TYPE* partition(TYPE* left, TYPE* right)
{
    TYPE* outerleft = left;
    TYPE* pivp = left + (right - left) / 2;

    TYPE piv = *pivp;
    *pivp = *outerleft;
    TYPE* mid = left;
    left += 1;

    while (left <= right) {
        if (IS_LOWER(*left, piv)) {
            mid++;
            swap(*mid, *left);
        }
        left++;
    }
    *outerleft = *mid;
    *mid = piv;
    return mid;
}

// ---------------------- heap_sort, testing and main -----------------------------------

void heap_sort(TYPE* left, TYPE* right) {

    long n = right - left + 1;
    if (n < 2) return;
    for (long i = n / 2, j, k; ; ) {
        if (i > 0) {
            k = left[--i];
        }
        else {
            n -= 1;
            if (n == 0) return;
            k = left[n];
            left[n] = left[0];
        }
        j = i;
        while (j * 2 + 1 < n) {
            long child = j * 2 + 1;
            if (child + 1 < n && IS_LOWER(left[child], left[child + 1])) child++;
            if (!IS_LOWER(k, left[child])) break;
            left[j] = left[child];
            j = child;
        }
        left[j] = k;
    }
}

// ---------------------------------------------------------

unsigned chksum;
unsigned hash_el(void *p, int sz) {

    unsigned char* c = (unsigned char*)p;
    unsigned h = 0;
    for (int i = 0; i < sz; i++) h = (h * 131) + c[i];
    return h;
}

void init(TYPE* data, int len) {

    chksum = 0;
    for (int i = 0; i < len; i++) {
        data[i] = rand();
        chksum += hash_el(&data[i], sizeof(TYPE));
    }
}

void test(TYPE* data, int len) {

    unsigned chks = hash_el(&data[0], sizeof(TYPE));;
    for (int i = 1; i < len; i++) {
        if (data[i] < data[i - 1]) {
            printf("ERROR ORDER\n");
            break;
        }
        chks += hash_el(&data[i], sizeof(TYPE));
    }
    if (chks != chksum) printf("ERROR CHKS\n");
}

double t(void) {

    static double t0;
    struct timeval tv;
    gettimeofday(&tv, NULL);
    double h = t0;
    t0 = tv.tv_sec + tv.tv_usec / 1000000.0;
    return t0 - h;
}

// ---------------------------------------------------------

#define SIZE (50 * 1000000)
TYPE data[SIZE];

int main(void) {

    init(data, SIZE);
    printf("Sorting %d million numbers with " NAME " ...\n", SIZE / 1000000);
    t();
    sort(data, SIZE);
    printf("%.3fs\n", t());
    test(data, SIZE);
    return 0;
}

Sorting 50 million numbers with Baseline Quicksort ...
3.234s

C++ std::sort

For comparison C++ std::sort:

#include <algorithm>
#include <sys/time.h>

#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int

#define NAME "C++ std::sort"

void sort(TYPE* data, int len) {
    std::sort(data, data + len);
}

// ---------------------- testing and main -----------------------------------
.
.
Sorting 50 million numbers with std::sort ...
1.190s

Branch-Avoidant

The paper https://arxiv.org/abs/1604.06697 by Edelkamp and A. Weiß shows how partitioning performance in Quicksort can be improved by avoiding conditional branches.

The idea of using auxiliary memory for branch-avoidant partitioning is inspired by https://github.com/scandum/fluxsort .

A small sorting network was used and built on a branchless structure.

// SPDX-License-Identifier: MIT
// (c) 2026 christof.kaser@gmail.com
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>
#include <stddef.h>

#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int

#define NAME "Branch-Avoidant Quicksort"

TYPE* partition(TYPE* left, TYPE* right);
void sorting_network(TYPE* left, int size);
void heap_sort(TYPE* left, TYPE* right);
TYPE* partition(TYPE* left, TYPE* right);

void sort(TYPE* data, int len) {

    TYPE* left = data;
    TYPE* stack[80];    // for max 2^40 items
    unsigned sp = 0;
    TYPE* right = data + len - 1;

    while (1) {
        ptrdiff_t partsz = right - left;
        if (partsz <= 3) {
            sorting_network(left, partsz);
            goto pop_stack;
        }
        TYPE* mid = partition(left, right);

        if ((mid - left) * 16 < partsz) {
            heap_sort(left, right);
            goto pop_stack;
        }
        if (mid < left + partsz / 2) {
            stack[sp] = mid + 1;
            stack[sp + 1] = right;
            right = mid - 1;
        }
        else {
            stack[sp] = left;
            stack[sp + 1] = mid - 1;
            left = mid + 1;
        }
        sp += 2;
        continue;

    pop_stack:
        if (sp == 0) break;
        sp -= 2;
        left = stack[sp];
        right = stack[sp + 1];
    }
}

// ---------------------- sorting network ----------------------------

#define sort2(a, b) do {  \
    __typeof__(a) x = a; \
    __typeof__(a) y = b; \
    unsigned m = IS_LOWER(x, y); \
    a = m ? x : y;  \
    b = m ? y : x;  \
} while(0)

#define sort3(a, b, c) do { \
    sort2(a, b); sort2(b, c); sort2(a, b); \
} while(0)

#define sort4(a, b, c, d) do { \
    sort2(a, b); sort2(c, d); sort2(a, c); \
    sort2(b, d); sort2(b, c); \
} while(0)

void sorting_network(TYPE* l, int partsz_min_1) {
    // partsz_min_1 == right - left
    switch (partsz_min_1) {
    case 3:
        sort4(l[0], l[1], l[2], l[3]);
        break;
    case 2:
        sort3(l[0], l[1], l[2]);
        break;
    case 1:
        sort2(l[0], l[1]);
        break;
    case 0:
        break;
    }
}
// ---------------------- partition ----------------------------

TYPE* partition(TYPE* restrict left, TYPE* restrict right) {

    TYPE* outerleft = left;
    TYPE* pivp = left + (right - left) / 2;

    sort3(left[1], *pivp, *right);
    left += 2;
    right -= 1;

    TYPE piv = *pivp;
    *pivp = *outerleft;

    #define SWSZ 1024
    TYPE _Alignas(64) swbuf[SWSZ];

    TYPE* left0 = left;
    TYPE* right0 = right;

    TYPE* sw = swbuf;
    #define UNROLL 16

    while (sw < swbuf + SWSZ - UNROLL && left <= right) {
        unsigned h = IS_LOWER(*right, piv);
        *right0 = *sw = *right--;
        right0 -= !h;
        sw += h;
    }
    while (1) {
        while (right0 > right && left <= right) {
            unsigned h = IS_LOWER(*left, piv);
            *left0 = *right0 = *left++;
            left0 += h;
            right0 -= !h;
        }
        while (left0 < left && left <= right) {
            unsigned h = IS_LOWER(*right, piv);
            *right0 = *left0 = *right--;
            right0 -= !h;
            left0 += h;
        }
        if (left > right) break;
    }
    memcpy(left0, swbuf, (sw - swbuf) * sizeof(TYPE));

    *outerleft = *right0;
    *right0 = piv;
    return right0;
}

// ---------------------- heapsort, testing and main -----------------------------------
.
.
.
Sorting 50 million numbers with Branch-Avoidant Quicksort ...
1.396s

Branch-Avoidant optimized with a sorting network up to 24

This variant handles small partitions of up to 24 elements using sorting networks, so these ranges are sorted directly instead of being partitioned further.

For larger parts of the array, the algorithm continues with branch‑avoidant partitioning and improved pivot selection.

// SPDX-License-Identifier: MIT
// (c) 2026 christof.kaser@gmail.com
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>
#include <stddef.h>

#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int

#define NAME "Optimized Branch-Avoidant Quicksort"

TYPE* partition(TYPE* left, TYPE* right);
void sorting_network(TYPE* left, int size);
void heap_sort(TYPE* left, TYPE* right);
TYPE* partition_small(TYPE* left, TYPE* right);

#define SMALLPART 1024

void sort(TYPE* data, int len) {

    TYPE* left = data;
    TYPE* stack[80];    // for max 2^40 items
    unsigned sp = 0;
    TYPE* right = data + len - 1;

    while (1) {
        ptrdiff_t partsz = right - left;
        if (partsz <= 23) {
            sorting_network(left, partsz);
            goto pop_stack;
        }
        TYPE* mid;
        if (partsz <= SMALLPART) mid = partition_small(left, right);
        else mid = partition(left, right);

        if (partsz > SMALLPART && (mid - left) * 16 < partsz) {
            heap_sort(left, right);
            goto pop_stack;
        }
        if (mid < left + partsz / 2) {
            stack[sp] = mid + 1;
            stack[sp + 1] = right;
            right = mid - 1;
        }
        else {
            stack[sp] = left;
            stack[sp + 1] = mid - 1;
            left = mid + 1;
        }
        sp += 2;
        continue;

    pop_stack:
        if (sp == 0) break;
        sp -= 2;
        left = stack[sp];
        right = stack[sp + 1];
    }
}

// ---------------------- sorting network ----------------------------

#define sort2(a, b) do {  \
    __typeof__(a) x = a; \
    __typeof__(a) y = b; \
    unsigned m = IS_LOWER(x, y); \
    a = m ? x : y;  \
    b = m ? y : x;  \
} while(0)

// https://bertdobbelaere.github.io/sorting_networks.html

#define sort3(a, b, c) do { \
    sort2(a, b); sort2(b, c); sort2(a, b); \
} while(0)

#define sort4(a, b, c, d) do { \
    sort2(a, b); sort2(c, d); sort2(a, c); \
    sort2(b, d); sort2(b, c); \
} while(0)

#define sort5(a, b, c, d, e) do { \
    sort2(b, c); sort2(d, e); sort2(b, d); \
    sort2(a, c); sort2(a, d); sort2(c, e); \
    sort2(a, b); sort2(c, d); sort2(b, c); \
} while(0)

#define sort6(a, b, c, d, e, f) do { \
    sort2(a, b); sort2(c, d); sort2(e, f); \
    sort2(a, c); sort2(b, d); sort2(e, f); \
    sort2(a, e); sort2(b, f); sort2(c, e); \
    sort2(d, f); sort2(b, c); sort2(d, e); \
    sort2(c, d); \
} while(0)

#define sort7(a, b, c, d, e, f, g) do { \
    sort2(a, b); sort2(c, d); sort2(a, c); \
    sort2(b, d); sort2(b, c); sort2(e, f); \
    sort2(e, g); sort2(f, g); sort2(a, e); \
    sort2(b, f); sort2(c, g); sort2(b, e); \
    sort2(d, g); sort2(c, e); sort2(b, c); \
    sort2(d, f); sort2(d, e); \
} while(0)

#define sort8(a,b,c,d,e,f,g,h) do { \
    sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); \
    sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); \
    sort2(b,c); sort2(f,g); \
    sort2(a,e); sort2(b,f); sort2(c,g); sort2(d,h); \
    sort2(c,e); sort2(d,f); \
    sort2(b,c); sort2(d,e); sort2(f,g); \
} while (0)

#define sort9(a,b,c,d,e,f,g,h,i) do { \
    sort2(a,d); sort2(b,h); sort2(c,f); sort2(e,i); \
    sort2(a,h); sort2(c,e); sort2(d,i); sort2(f,g); \
    sort2(a,c); sort2(b,d); sort2(e,f); sort2(h,i); \
    sort2(b,e); sort2(d,g); sort2(f,h); \
    sort2(a,b); sort2(c,e); sort2(d,f); sort2(g,i); \
    sort2(c,d); sort2(e,f); sort2(g,h); \
    sort2(b,c); sort2(d,e); sort2(f,g); \
} while (0)

#define sort10(a,b,c,d,e,f,g,h,i,j) do { \
    sort2(a,i); sort2(b,j); sort2(c,h); sort2(d,f); sort2(e,g); \
    sort2(a,c); sort2(b,e); sort2(f,i); sort2(h,j); \
    sort2(a,d); sort2(c,e); sort2(f,h); sort2(g,j); \
    sort2(a,b); sort2(d,g); sort2(i,j); \
    sort2(b,f); sort2(c,d); sort2(e,i); sort2(g,h); \
    sort2(b,c); sort2(d,f); sort2(e,g); sort2(h,i); \
    sort2(c,d); sort2(e,f); sort2(g,h); \
    sort2(d,e); sort2(f,g); \
} while (0)

#define sort11(a,b,c,d,e,f,g,h,i,j,k) do { \
    sort2(a,j); sort2(b,g); sort2(c,e); sort2(d,h); sort2(f,i); \
    sort2(a,b); sort2(d,f); sort2(e,k); sort2(g,j); sort2(h,i); \
    sort2(b,d); sort2(c,f); sort2(e,h); sort2(i,k); \
    sort2(a,e); sort2(b,c); sort2(d,h); sort2(f,j); sort2(g,i); \
    sort2(a,b); sort2(c,g); sort2(e,f); sort2(h,i); sort2(j,k); \
    sort2(c,e); sort2(d,g); sort2(f,h); sort2(i,j); \
    sort2(b,c); sort2(d,e); sort2(f,g); sort2(h,i); \
    sort2(c,d); sort2(e,f); sort2(g,h); \
} while (0)

#define sort12(a,b,c,d,e,f,g,h,i,j,k,l) do { \
    sort2(a,i); sort2(b,h); sort2(c,g); sort2(d,l); sort2(e,k); sort2(f,j); \
    sort2(a,c); sort2(b,e); sort2(d,f); sort2(g,i); sort2(h,k); sort2(j,l); \
    sort2(a,b); sort2(c,j); sort2(e,h); sort2(f,g); sort2(k,l); \
    sort2(b,d); sort2(c,h); sort2(e,j); sort2(i,k); \
    sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
    sort2(b,c); sort2(d,f); sort2(g,i); sort2(j,k); \
    sort2(c,e); sort2(d,g); sort2(f,i); sort2(h,j); \
    sort2(b,c); sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
} while (0)

#define sort13(a,b,c,d,e,f,g,h,i,j,k,l,m) do { \
    sort2(a,l); sort2(b,h); sort2(c,e); sort2(d,f); sort2(i,j); sort2(k,m); \
    sort2(a,c); sort2(d,g); sort2(e,m); sort2(f,h); sort2(i,k); \
    sort2(a,i); sort2(b,d); sort2(c,f); sort2(e,j); sort2(g,l); sort2(h,m); \
    sort2(a,b); sort2(c,k); sort2(d,i); sort2(e,g); sort2(j,l); \
    sort2(b,d); sort2(c,e); sort2(f,k); sort2(g,i); sort2(h,j); sort2(l,m); \
    sort2(b,c); sort2(d,e); sort2(f,i); sort2(g,j); sort2(h,k); \
    sort2(c,d); sort2(e,h); sort2(f,g); sort2(i,l); sort2(j,k); \
    sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
    sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
} while (0)

#define sort14(a,b,c,d,e,f,g,h,i,j,k,l,m,n) do { \
    sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); \
    sort2(i,j); sort2(k,l); sort2(m,n); \
    sort2(a,c); sort2(b,d); sort2(e,i); sort2(f,j); \
    sort2(k,m); sort2(l,n); \
    sort2(a,k); sort2(b,g); sort2(c,l); sort2(d,n); \
    sort2(f,i); sort2(h,m); \
    sort2(b,e); sort2(c,i); sort2(d,g); \
    sort2(f,l); sort2(h,k); sort2(j,m); \
    sort2(a,b); sort2(d,j); sort2(e,k); \
    sort2(f,h); sort2(g,i); sort2(m,n); \
    sort2(b,f); sort2(c,e); sort2(d,h); \
    sort2(g,k); sort2(i,m); sort2(j,l); \
    sort2(b,c); sort2(d,f); sort2(e,g); \
    sort2(h,j); sort2(i,k); sort2(l,m); \
    sort2(c,d); sort2(e,f); sort2(g,h); \
    sort2(i,j); sort2(k,l); \
    sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
} while (0)

#define sort15(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o) do { \
    sort2(a,g); sort2(b,k); sort2(c,o); sort2(d,j); \
    sort2(e,m); sort2(f,n); sort2(h,l); \
    sort2(a,h); sort2(c,f); sort2(d,e); \
    sort2(g,l); sort2(i,k); sort2(j,m); sort2(n,o); \
    sort2(b,n); sort2(c,d); sort2(e,g); \
    sort2(f,j); sort2(h,i); sort2(k,o); sort2(l,m); \
    sort2(a,d); sort2(b,e); sort2(f,h); \
    sort2(g,n); sort2(i,j); sort2(k,l); sort2(m,o); \
    sort2(a,c); sort2(b,f); sort2(d,i); \
    sort2(e,g); sort2(h,k); sort2(j,l); sort2(m,n); \
    sort2(a,b); sort2(c,f); sort2(d,k); \
    sort2(e,i); sort2(g,h); sort2(j,m); sort2(l,n); \
    sort2(b,c); sort2(d,e); sort2(f,g); \
    sort2(h,j); sort2(i,k); sort2(l,m); \
    sort2(d,f); sort2(e,g); sort2(h,i); sort2(j,k); \
    sort2(c,d); sort2(e,f); sort2(g,h); \
    sort2(i,j); sort2(k,l); \
} while (0)

#define sort16(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) do { \
    sort2(a,f); sort2(b,e); sort2(c,m); sort2(d,n); \
    sort2(g,h); sort2(i,j); sort2(k,p); sort2(l,o); \
    sort2(a,c); sort2(b,k); sort2(d,g); sort2(e,h); \
    sort2(f,o); sort2(i,l); sort2(j,m); sort2(n,p); \
    sort2(a,i); sort2(b,d); sort2(c,l); sort2(e,n); \
    sort2(f,j); sort2(g,k); sort2(h,p); sort2(m,o); \
    sort2(a,b); sort2(c,e); sort2(d,i); sort2(f,g); \
    sort2(h,m); sort2(j,k); sort2(l,n); sort2(o,p); \
    sort2(b,d); sort2(c,f); sort2(e,i); sort2(g,j); \
    sort2(h,l); sort2(k,n); sort2(m,o); \
    sort2(b,c); sort2(d,f); sort2(e,l); sort2(g,i); \
    sort2(h,j); sort2(k,m); sort2(n,o); \
    sort2(c,d); sort2(e,f); sort2(g,h); sort2(i,j); \
    sort2(k,l); sort2(m,n); \
    sort2(e,g); sort2(f,h); sort2(i,k); sort2(j,l); \
    sort2(d,e); sort2(f,g); sort2(h,i); \
    sort2(j,k); sort2(l,m); \
} while (0)

#define sort17(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q) do { \
    sort2(b,c); sort2(d,e); sort2(f,g); sort2(h,i); \
    sort2(j,k); sort2(l,m); sort2(n,o); sort2(p,q); \
    sort2(b,d); sort2(c,e); sort2(f,h); sort2(g,i); \
    sort2(j,l); sort2(k,m); sort2(n,p); sort2(o,q); \
    sort2(b,f); sort2(c,g); sort2(d,h); sort2(e,i); \
    sort2(j,n); sort2(k,o); sort2(l,p); sort2(m,q); \
    sort2(a,d); sort2(b,n); sort2(c,k); sort2(e,h); \
    sort2(f,l); sort2(g,m); sort2(i,j); sort2(o,p); \
    sort2(a,n); sort2(b,i); sort2(c,f); sort2(d,g); \
    sort2(e,o); sort2(h,p); sort2(j,q); sort2(k,l); \
    sort2(a,b); sort2(c,i); sort2(d,e); sort2(f,k); \
    sort2(g,n); sort2(h,l); sort2(m,o); \
    sort2(b,f); sort2(d,i); sort2(e,k); sort2(g,h); \
    sort2(j,m); sort2(l,n); \
    sort2(b,c); sort2(e,g); sort2(f,i); sort2(h,k); \
    sort2(j,l); sort2(m,o); sort2(n,p); \
    sort2(c,d); sort2(e,f); sort2(g,i); sort2(h,j); \
    sort2(k,l); sort2(m,n); sort2(o,p); \
    sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
    sort2(l,m); sort2(n,o); sort2(p,q); \
} while (0)

#define sort18(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r) do { \
    sort2(a,g); sort2(b,k); sort2(c,p); sort2(d,f); sort2(e,j); \
    sort2(h,q); sort2(i,n); sort2(l,r); sort2(m,o); \
    sort2(a,m); sort2(b,e); sort2(d,l); sort2(f,r); sort2(g,o); \
    sort2(h,i); sort2(j,k); sort2(n,q); \
    sort2(b,n); sort2(c,h); sort2(e,q); sort2(g,j); \
    sort2(i,l); sort2(k,p); \
    sort2(a,b); sort2(c,d); sort2(e,m); sort2(f,n); \
    sort2(h,j); sort2(i,k); sort2(o,p); sort2(q,r); \
    sort2(a,c); sort2(b,l); sort2(d,e); sort2(f,h); \
    sort2(g,q); sort2(k,m); sort2(n,o); sort2(p,r); \
    sort2(b,i); sort2(e,k); sort2(f,g); sort2(h,n); \
    sort2(j,q); sort2(l,m); \
    sort2(b,d); sort2(c,f); sort2(e,h); sort2(g,i); \
    sort2(j,l); sort2(k,n); sort2(m,p); sort2(o,q); \
    sort2(b,c); sort2(d,f); sort2(e,g); sort2(h,j); \
    sort2(i,k); sort2(l,n); sort2(m,o); sort2(p,q); \
    sort2(c,d); sort2(f,i); sort2(g,h); sort2(j,m); \
    sort2(k,l); sort2(o,p); \
    sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
    sort2(l,m); sort2(n,o); \
    sort2(e,f); sort2(g,h); sort2(i,j); \
    sort2(k,l); sort2(m,n); \
} while (0)
#define sort19(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s) do { \
    sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); \
    sort2(i,k); sort2(l,m); sort2(n,o); sort2(p,q); sort2(r,s); \
    sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); \
    sort2(i,j); sort2(l,n); sort2(m,o); sort2(p,r); sort2(q,s); \
    sort2(a,e); sort2(b,f); sort2(c,g); sort2(d,h); \
    sort2(j,k); sort2(l,p); sort2(m,q); sort2(n,r); sort2(o,s); \
    sort2(a,l); sort2(b,i); sort2(c,n); sort2(d,r); \
    sort2(e,k); sort2(f,g); sort2(j,q); sort2(m,p); \
    sort2(b,c); sort2(d,n); sort2(e,m); sort2(f,o); \
    sort2(g,q); sort2(h,k); sort2(i,p); \
    sort2(a,b); sort2(c,l); sort2(d,j); sort2(f,m); \
    sort2(g,p); sort2(h,n); sort2(k,s); sort2(o,r); \
    sort2(b,e); sort2(d,i); sort2(f,l); sort2(g,j); \
    sort2(h,m); sort2(k,n); sort2(o,p); sort2(q,r); \
    sort2(c,e); sort2(d,f); sort2(g,h); \
    sort2(i,l); sort2(j,m); sort2(k,o); sort2(n,p); \
    sort2(c,d); sort2(e,f); sort2(g,i); sort2(h,j); \
    sort2(k,l); sort2(m,o); sort2(n,q); sort2(p,r); \
    sort2(b,c); sort2(e,g); sort2(f,i); sort2(h,k); \
    sort2(j,l); sort2(m,n); sort2(o,q); \
    sort2(d,e); sort2(f,g); sort2(h,i); \
    sort2(j,k); sort2(l,m); sort2(n,o); sort2(p,q); \
} while (0)

#define sort20(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t) do { \
    sort2(a,m); sort2(b,n); sort2(c,o); sort2(d,p); sort2(e,q); \
    sort2(f,r); sort2(g,s); sort2(h,t); sort2(i,k); sort2(j,l); \
    sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); \
    sort2(i,j); sort2(k,l); sort2(m,o); sort2(n,p); sort2(q,s); sort2(r,t); \
    sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); \
    sort2(m,n); sort2(o,p); sort2(q,r); sort2(s,t); \
    sort2(a,e); sort2(b,m); sort2(c,q); sort2(d,r); \
    sort2(f,i); sort2(g,j); sort2(h,s); \
    sort2(k,n); sort2(l,o); sort2(p,t); \
    sort2(b,g); sort2(d,k); sort2(e,f); sort2(h,l); \
    sort2(i,m); sort2(j,q); sort2(n,s); sort2(o,p); \
    sort2(a,e); sort2(c,i); sort2(d,j); sort2(g,h); \
    sort2(k,q); sort2(l,r); sort2(m,n); sort2(p,t); \
    sort2(b,e); sort2(d,g); sort2(f,i); sort2(h,k); \
    sort2(j,m); sort2(l,o); sort2(n,q); sort2(p,s); \
    sort2(c,d); sort2(e,f); sort2(g,i); sort2(h,j); \
    sort2(k,m); sort2(l,n); sort2(o,p); sort2(q,r); \
    sort2(c,e); sort2(d,g); sort2(f,h); sort2(i,k); \
    sort2(j,l); sort2(m,o); sort2(n,q); sort2(p,r); \
    sort2(b,c); sort2(d,f); sort2(g,h); sort2(i,j); \
    sort2(k,l); sort2(m,n); sort2(o,q); sort2(r,s); \
    sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
    sort2(l,m); sort2(n,o); sort2(p,q); \
} while (0)

#define sort21(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u) do { \
    sort2(a,h); sort2(b,k); sort2(d,f); sort2(e,i); sort2(g,n); \
    sort2(j,t); sort2(l,o); sort2(m,r); sort2(p,q); sort2(s,u); \
    sort2(a,l); sort2(b,p); sort2(c,m); sort2(d,e); sort2(f,i); \
    sort2(g,j); sort2(h,o); sort2(k,q); sort2(n,t); sort2(r,u); \
    sort2(a,g); sort2(b,d); sort2(c,s); sort2(e,p); sort2(f,k); \
    sort2(i,q); sort2(l,r); sort2(m,n); sort2(o,u); \
    sort2(c,g); sort2(f,m); sort2(h,s); sort2(i,o); \
    sort2(j,l); sort2(k,r); sort2(n,t); sort2(q,u); \
    sort2(b,c); sort2(e,h); sort2(f,j); sort2(g,r); \
    sort2(k,n); sort2(l,m); sort2(o,t); sort2(p,s); \
    sort2(a,c); sort2(d,g); sort2(e,f); sort2(h,k); \
    sort2(i,l); sort2(j,p); sort2(m,q); sort2(n,s); \
    sort2(o,r); sort2(t,u); \
    sort2(a,b); sort2(c,d); sort2(f,j); sort2(g,m); \
    sort2(h,i); sort2(l,o); sort2(n,p); sort2(q,t); sort2(r,s); \
    sort2(b,c); sort2(d,j); sort2(g,n); sort2(k,l); \
    sort2(m,p); sort2(q,r); sort2(s,t); \
    sort2(b,e); sort2(c,f); sort2(d,h); sort2(g,k); \
    sort2(i,j); sort2(l,m); sort2(n,o); sort2(r,s); \
    sort2(c,e); sort2(f,g); sort2(h,i); sort2(j,l); \
    sort2(k,n); sort2(m,p); sort2(o,q); \
    sort2(d,e); sort2(f,h); sort2(g,i); sort2(j,k); \
    sort2(l,n); sort2(m,o); sort2(p,q); \
    sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
    sort2(m,n); sort2(o,p); sort2(q,r); \
} while (0)
#define sort22(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v) do { \
    sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
    sort2(m,n); sort2(o,p); sort2(q,r); sort2(s,t); sort2(u,v); \
    sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); sort2(i,k); sort2(l,n); \
    sort2(o,q); sort2(p,r); sort2(s,u); sort2(t,v); \
    sort2(a,e); sort2(b,f); sort2(c,g); sort2(d,h); sort2(i,m); sort2(j,n); \
    sort2(o,s); sort2(p,t); sort2(q,u); sort2(r,v); \
    sort2(a,o); sort2(b,p); sort2(c,s); sort2(d,t); sort2(e,q); sort2(f,r); \
    sort2(g,u); sort2(h,v); sort2(j,l); sort2(k,m); \
    sort2(a,i); sort2(c,k); sort2(e,o); sort2(f,m); sort2(g,p); sort2(h,r); \
    sort2(j,q); sort2(l,t); sort2(n,v); \
    sort2(b,j); sort2(c,e); sort2(d,q); sort2(f,s); sort2(g,k); sort2(h,n); \
    sort2(i,o); sort2(l,p); sort2(m,u); sort2(r,t); \
    sort2(b,i); sort2(d,l); sort2(e,f); sort2(h,m); sort2(j,o); sort2(k,s); \
    sort2(n,u); sort2(q,r); \
    sort2(b,c); sort2(d,f); sort2(e,i); sort2(g,j); sort2(h,l); sort2(k,o); \
    sort2(m,p); sort2(n,r); sort2(q,s); sort2(t,u); \
    sort2(c,e); sort2(d,g); sort2(f,j); sort2(h,k); sort2(l,o); sort2(m,q); \
    sort2(p,s); sort2(r,t); \
    sort2(d,e); sort2(f,h); sort2(g,i); sort2(j,l); sort2(k,m); sort2(n,p); \
    sort2(o,q); sort2(r,s); \
    sort2(f,g); sort2(h,i); sort2(j,k); sort2(l,m); sort2(n,o); sort2(p,q); \
    sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); sort2(m,n); sort2(o,p); \
    sort2(q,r); \
} while (0)

#define sort23(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w) do { \
    sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
    sort2(m,n); sort2(o,p); sort2(q,r); sort2(s,t); sort2(u,v); \
    sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); sort2(i,k); sort2(j,l); \
    sort2(m,o); sort2(n,p); sort2(r,t); sort2(s,u); sort2(v,w); \
    sort2(a,e); sort2(b,f); sort2(c,g); sort2(d,h); \
    sort2(i,m); sort2(j,n); sort2(k,o); sort2(l,p); \
    sort2(q,v); sort2(r,w); \
    sort2(b,k); sort2(c,j); sort2(d,l); sort2(g,t); \
    sort2(m,r); sort2(o,w); sort2(q,s); sort2(u,v); \
    sort2(a,q); sort2(b,c); sort2(d,v); sort2(e,r); \
    sort2(f,o); sort2(g,n); sort2(h,w); sort2(j,s); \
    sort2(k,u); sort2(p,t); \
    sort2(b,k); sort2(c,j); sort2(d,r); sort2(e,m); \
    sort2(f,s); sort2(g,u); sort2(h,p); sort2(i,q); \
    sort2(l,o); sort2(n,v); sort2(t,w); \
    sort2(a,i); sort2(b,e); sort2(c,k); sort2(d,j); \
    sort2(f,g); sort2(l,v); sort2(m,q); sort2(n,u); \
    sort2(o,p); sort2(r,s); \
    sort2(c,i); sort2(d,f); sort2(e,m); sort2(g,j); \
    sort2(h,l); sort2(k,q); sort2(n,r); sort2(p,v); \
    sort2(s,u); \
    sort2(b,c); sort2(e,i); sort2(f,k); sort2(g,m); \
    sort2(h,n); sort2(j,q); sort2(l,s); sort2(o,r); \
    sort2(p,t); \
    sort2(c,e); sort2(d,f); sort2(g,i); sort2(h,j); \
    sort2(k,m); sort2(l,n); sort2(o,q); sort2(p,u); \
    sort2(r,s); sort2(t,v); \
    sort2(d,g); sort2(f,i); sort2(h,k); sort2(j,m); \
    sort2(l,o); sort2(n,q); sort2(p,r); sort2(s,u); \
    sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
    sort2(l,m); sort2(n,o); sort2(p,q); sort2(r,s); \
    sort2(t,u); \
} while (0)

#define sort24(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x) do { \
    sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
    sort2(m,n); sort2(o,p); sort2(q,r); sort2(s,t); sort2(u,v); sort2(w,x); \
    sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); sort2(i,k); sort2(j,l); \
    sort2(m,o); sort2(n,p); sort2(q,s); sort2(r,t); sort2(u,w); sort2(v,x); \
    sort2(a,e); sort2(b,f); sort2(c,g); sort2(d,h); \
    sort2(i,m); sort2(j,n); sort2(k,o); sort2(l,p); \
    sort2(q,u); sort2(r,v); sort2(s,w); sort2(t,x); \
    sort2(a,q); sort2(b,s); sort2(c,r); sort2(d,t); \
    sort2(e,u); sort2(f,w); sort2(g,v); sort2(h,x); \
    sort2(j,k); sort2(n,o); \
    sort2(c,k); sort2(d,l); sort2(f,s); sort2(g,o); \
    sort2(h,p); sort2(i,q); sort2(j,r); sort2(m,u); \
    sort2(n,v); \
    sort2(a,i); sort2(b,j); sort2(c,m); sort2(d,u); \
    sort2(e,q); sort2(f,n); sort2(g,r); sort2(h,t); \
    sort2(k,s); sort2(l,v); sort2(o,w); sort2(p,x); \
    sort2(b,i); sort2(d,q); sort2(e,m); sort2(f,k); \
    sort2(g,j); sort2(h,u); sort2(l,t); sort2(n,s); \
    sort2(o,r); sort2(p,w); \
    sort2(c,e); sort2(d,f); sort2(h,n); sort2(j,m); \
    sort2(k,q); sort2(l,o); sort2(s,u); sort2(t,v); \
    sort2(b,c); sort2(e,i); sort2(f,j); sort2(g,k); \
    sort2(h,l); sort2(m,q); sort2(n,r); sort2(o,s); \
    sort2(p,t); sort2(v,w); \
    sort2(c,e); sort2(d,i); sort2(f,g); sort2(h,j); \
    sort2(k,m); sort2(l,n); sort2(o,q); sort2(p,u); \
    sort2(r,s); sort2(t,v); \
    sort2(d,f); sort2(g,i); sort2(h,k); sort2(j,m); \
    sort2(l,o); sort2(n,q); sort2(p,r); sort2(s,u); \
    sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
    sort2(l,m); sort2(n,o); sort2(p,q); sort2(r,s); sort2(t,u); \
} while (0)

void sorting_network(TYPE* l, int partsz_min_1) {
    // partsz_min_1 == right - left
    switch (partsz_min_1) {
    case 23:
        sort24(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
            l[16], l[17], l[18], l[19], l[20], l[21], l[22], l[23]);
        break;
    case 22:
        sort23(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
            l[16], l[17], l[18], l[19], l[20], l[21], l[22]);
        break;
    case 21:
        sort22(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
            l[16], l[17], l[18], l[19], l[20], l[21]);
        break;
    case 20:
        sort21(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
            l[16], l[17], l[18], l[19], l[20]);
        break;
    case 19:
        sort20(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
            l[16], l[17], l[18], l[19]);
        break;
    case 18:
        sort19(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
            l[16], l[17], l[18]);
        break;
    case 17:
        sort18(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15], l[16], l[17]);
        break;
    case 16:
        sort17(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15], l[16]);
        break;
    case 15:
        sort16(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15]);
        break;
    case 14:
        sort15(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14]);
        break;
    case 13:
        sort14(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13]);
        break;
    case 12:
        sort13(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12]);
            break;
    case 11:
        sort12(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11]);
        break;
    case 10:
        sort11(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8], l[9], l[10]);
        break;
    case 9:
        sort10(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8], l[9]);
        break;
    case 8:
        sort9(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8]);
        break;
    case 7:
        sort8(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7]);
        break;
    case 6:
        sort7(l[0], l[1], l[2], l[3], l[4], l[5], l[6]);
        break;
    case 5:
        sort6(l[0], l[1], l[2], l[3], l[4], l[5]);
        break;
    case 4:
        sort5(l[0], l[1], l[2], l[3], l[4]);
        break;
    case 3:
        sort4(l[0], l[1], l[2], l[3]);
        break;
    case 2:
        sort3(l[0], l[1], l[2]);
        break;
    case 1:
        sort2(l[0], l[1]);
        break;
    case 0:
        break;
    }
}
// ---------------------- partition ----------------------------

#define med5(a, b, c, d, e) do { \
    sort2(a, b); sort2(c, d); sort2(a, c); \
    sort2(b, d); sort2(b, c); sort2(c, e); \
    sort2(b, c); \
} while(0)

TYPE* partition_small(TYPE* restrict  left, TYPE* restrict  right) {
    TYPE* outerleft = left;

    TYPE* pivp = left + (right - left) / 2;

    med5(left[1], left[2], *pivp, right[-1], *right);
    left += 3;
    right -= 2;

    TYPE piv = *pivp;
    *pivp = *outerleft;

    TYPE _Alignas(64) swbuf[SMALLPART];
    TYPE* sw = swbuf;
    TYPE* left0 = left;
    while (left <= right) {
        unsigned h = IS_LOWER(*left, piv);
        *left0 = *sw = *left++;
        left0 += h;
        sw += !h;
    }
    memcpy(left0, swbuf, (sw - swbuf) * sizeof(TYPE));
    left0 -= 1;
    *outerleft = *left0;
    *left0 = piv;
    return left0;
}

TYPE* partition(TYPE* restrict left, TYPE* restrict right) {

    TYPE* outerleft = left;
    int n = right - left;
    TYPE* pivp = left + n / 2;
    TYPE piv;

    TYPE* q1 = left + n / 4;
    TYPE* q3 = pivp + n / 4;

    left += 1;
    med5(left[0], left[1], left[2], left[3], left[4]);
    med5(q1[-2], q1[-1], q1[0], q1[1], q1[2]);
    med5(pivp[-2], pivp[-1], pivp[0], pivp[1], pivp[2]);
    med5(q3[-2], q3[-1], q3[0], q3[1], q3[2]);
    med5(right[-4], right[-3], right[-2], right[-1], right[0]);
    med5(left[2], q1[0], pivp[0], q3[0], right[-2]);

    piv = *pivp;
    *pivp = *outerleft;

    #define SWSZ 1024
    TYPE _Alignas(64) swbuf[SWSZ];

    TYPE* left0 = left;
    TYPE* right0 = right;

    TYPE* sw = swbuf;
    #define UNROLL 16
    while (sw < swbuf + SWSZ - UNROLL && left <= right - UNROLL) {
        for (int i = UNROLL; i--;) {
            unsigned h = IS_LOWER(*right, piv); *right0 = *sw = *right--; right0 -= !h; sw += h;
        }
    }
    while (sw < swbuf + SWSZ - UNROLL && left <= right) {
        unsigned h = IS_LOWER(*right, piv); *right0 = *sw = *right--; right0 -= !h; sw += h;
    }
    while (1) {
        while (right0 > right + UNROLL && left <= right - UNROLL) {
            for (int i = UNROLL; i--;) {
                unsigned h = IS_LOWER(*left, piv); *left0 = *right0 = *left++; left0 += h; right0 -= !h;
            }
        }
        while (left0 < left - UNROLL && left <= right - UNROLL) {
            for (int i = UNROLL; i--;) {
                unsigned h = IS_LOWER(*right, piv); *right0 = *left0 = *right--; right0 -= !h; left0 += h;
            }
        }
        if (left > right - UNROLL) break;
    }
    while (right0 > right && left <= right) {
        unsigned h = IS_LOWER(*left, piv); *left0 = *right0 = *left++; left0 += h; right0 -= !h;
    }
    while (left <= right) {
        unsigned h = IS_LOWER(*right, piv); *right0 = *left0 = *right--; right0 -= !h; left0 += h;
    }
    memcpy(left0, swbuf, (sw - swbuf) * sizeof(TYPE));

    *outerleft = *right0;
    *right0 = piv;
    return right0;
}

// ---------------------- heapsort, testing and main -----------------------------------
.
.
Sorting 50 million numbers Optimized Branch-Avoidant Quicksort ...
0.870s


Multithreaded sorting

Interactive sorting demo


christof.kaser@gmail.com