Fast Branch-Avoidant Quicksort with Sorting-Networks

On modern CPUs, avoiding branch misprediction is an effective way to speed up programs.

When ‘if’ slows you down, avoid it

Quicksort in C - Performance Comparison (Single-Thread)

Performance results naturally depend on the underlying hardware. The following benchmarks show the execution times for sorting 50 million integers using different sorting implementations. The measurements were taken on an Apple M1 system and on an Intel Xeon (E5-2680) system with the -O3 compiler option.

Implementation Apple M1 Intel Xeon
Baseline Quicksort 3.70s 5.80s
Branch-Avoidant Simple 1.70s 3.41s
std::sort (C++) 1.19s 4.95s
pdqsort 1.20s 2.35s
ips4o 1.22s 2.02s
Branch-Avoidant Sorting-Networks 12 0.91s 1.80s
Branch-Avoidant Sorting-Networks 26 0.86s 1.74s

Full source code is included on the page in scrollable blocks.

Baseline Quicksort

This is a simple iterative quicksort written in C using a Lomuto-style partition, as commonly found in textbooks.

The algorithm always continues with the larger partition while storing the smaller one on a manually managed stack, in order to guarantee that a stack overflow cannot occur.

The element type TYPE and comparison logic IS_LOWER are defined via C macros at compile time. This provides the flexibility to sort any data type - for example, floating-point numbers, structures, or pointers to C strings.

// SPDX-License-Identifier: MIT
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>

#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int

#define NAME "Baseline Quicksort"

TYPE* partition(TYPE* left, TYPE* right);

void sort(TYPE* data, int len) {

    TYPE* left = data;
    TYPE* stack[80];    // for max 2^40 items
    unsigned sp = 0;
    TYPE* right = data + len - 1;

    while (1) {
        long partsz = right - left;
        if (partsz > 0) {
            TYPE* mid = partition(left, right);
            if (mid < left + partsz / 2) {
                stack[sp] = mid + 1;
                stack[sp + 1] = right;
                right = mid - 1;
            }
            else {
                stack[sp] = left;
                stack[sp + 1] = mid - 1;
                left = mid + 1;
            }
            sp += 2;
        }
        else {
            if (sp == 0) break;
            sp -= 2;
            left = stack[sp];
            right = stack[sp + 1];
        }
    }
}

// ---------------------- partition ----------------------------

#define swap(a, b) do { TYPE _h = a; a = b; b = _h; } while (0)

TYPE* partition(TYPE* left, TYPE* right)
{
    TYPE* outerleft = left;
    TYPE* pivp = left + (right - left) / 2;

    TYPE piv = *pivp;
    *pivp = *outerleft;
    TYPE* mid = left;
    left += 1;

    while (left <= right) {
        if (IS_LOWER(*left, piv)) {
            mid++;
            swap(*mid, *left);
        }
        left++;
    }
    *outerleft = *mid;
    *mid = piv;
    return mid;
}

// ---------------------- testing and main -----------------------------------

unsigned chksum;
unsigned hash_el(void *p, int sz) {
    unsigned char* c = (unsigned char*)p;
    unsigned h = 0;
    for (int i = 0; i < sz; i++) h = (h * 131) + c[i];
    return h;
}
void init(TYPE* data, int len) {
    chksum = 0;
    for (int i = 0; i < len; i++) {
        data[i] = rand();
        chksum += hash_el(&data[i], sizeof(TYPE));
    }
}
void test(TYPE* data, int len) {
    unsigned chks = hash_el(&data[0], sizeof(TYPE));;
    for (int i = 1; i < len; i++) {
        if (IS_LOWER(data[i], data[i - 1])) {
            printf("ERROR ORDER\n");
            return;
        }
        chks += hash_el(&data[i], sizeof(TYPE));
    }
    if (chks != chksum) printf("ERROR CHKS\n");
}
double t(void) {

    static double twr;
    struct timeval tv;
    gettimeofday(&tv, NULL);
    double h = twr;
    twr = tv.tv_sec + tv.tv_usec / 1000000.0;
    return twr - h;
}

#define SIZE (50 * 1000000)
TYPE data[SIZE];

int main(void) {

    srand(time(NULL));
    init(data, SIZE);
    printf(NAME"\n");
    printf("Sorting %d million integers ...\n", SIZE / 1000000);
    t();
    sort(data, SIZE);
    printf("%.2fs\n", t());
    test(data, SIZE);
    return 0;
}
Baseline Quicksort
Sorting 50 million integers ...
3.70s

Branch-Avoidant

The paper BlockQuicksort: How Branch Mispredictions don’t affect Quicksort by Edelkamp and A. Weiß shows how partitioning performance in Quicksort can be improved by avoiding conditional branches.

The strategy of using an auxiliary buffer for branch-avoidant partitioning is inspired by fluxsort .

The “auxiliary buffer” here means a 1024‑element stack array, not heap memory.

Branchless sorting then works roughly as follows.

int islower = IS_LOWER(*left, piv);
*rwr = *left;
*lrw = *left;
left++;
rwr -= !islower;
lwr += islower;
Simply by using branchless partitioning, the implementation now is more than twice as fast.

// SPDX-License-Identifier: MIT
// (c) 2026 christof.kaser@gmail.com
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>

#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int

#define NAME "Branch-Avoidant Simple"

TYPE* partition(TYPE* left, TYPE* right);

void sort(TYPE* data, int len) {

    TYPE* left = data;
    TYPE* stack[80];    // for max 2^40 items
    unsigned sp = 0;
    TYPE* right = data + len - 1;

    while (1) {
        long partsz = right - left;
        if (partsz > 0) {
            TYPE* mid = partition(left, right);
            if (mid < left + partsz / 2) {
                stack[sp] = mid + 1;
                stack[sp + 1] = right;
                right = mid - 1;
            }
            else {
                stack[sp] = left;
                stack[sp + 1] = mid - 1;
                left = mid + 1;
            }
            sp += 2;
        }
        else {
            if (sp == 0) break;
            sp -= 2;
            left = stack[sp];
            right = stack[sp + 1];
        }
    }
}

// ---------------------- partition ----------------------------

TYPE* partition(TYPE* left, TYPE* right) {

    TYPE* outerleft = left;
    TYPE* pivp = left + (right - left) / 2;
    left += 1;
    TYPE piv = *pivp;
    *pivp = *outerleft;

    #define SWSZ 1024
    TYPE swbuf[SWSZ];

    TYPE* lwr = left;
    TYPE* rwr = right;

    TYPE* sw = swbuf;

    while (sw < swbuf + SWSZ  && left <= right) {
        unsigned h = IS_LOWER(*right, piv); *rwr = *sw = *right--; rwr -= !h; sw += h;
    }
    while (left <= right) {
        while (rwr > right && left <= right) {
            unsigned h = IS_LOWER(*left, piv); *lwr = *rwr = *left++; lwr += h; rwr -= !h;
        }
        while (lwr < left && left <= right) {
            unsigned h = IS_LOWER(*right, piv); *rwr = *lwr = *right--; rwr -= !h; lwr += h;
        }
    }
    memcpy(lwr, swbuf, (sw - swbuf) * sizeof(TYPE));

    *outerleft = *rwr;
    *rwr = piv;
    return rwr;
}

// --------------------------  testing and main -----------------------------------

unsigned chksum;
unsigned hash_el(void *p, int sz) {
    unsigned char* c = (unsigned char*)p;
    unsigned h = 0;
    for (int i = 0; i < sz; i++) h = (h * 131) + c[i];
    return h;
}
void init(TYPE* data, int len) {
    chksum = 0;
    for (int i = 0; i < len; i++) {
        data[i] = rand();
        chksum += hash_el(&data[i], sizeof(TYPE));
    }
}
void test(TYPE* data, int len) {
    unsigned chks = hash_el(&data[0], sizeof(TYPE));;
    for (int i = 1; i < len; i++) {
        if (IS_LOWER(data[i], data[i - 1])) {
            printf("ERROR ORDER\n");
            return;
        }
        chks += hash_el(&data[i], sizeof(TYPE));
    }
    if (chks != chksum) printf("ERROR CHKS\n");
}
double t(void) {

    static double twr;
    struct timeval tv;
    gettimeofday(&tv, NULL);
    double h = twr;
    twr = tv.tv_sec + tv.tv_usec / 1000000.0;
    return twr - h;
}

#define SIZE (50 * 1000000)
TYPE data[SIZE];

int main(void) {

    srand(time(NULL));
    init(data, SIZE);
    printf(NAME"\n");
    printf("Sorting %d million integers ...\n", SIZE / 1000000);
    t();
    sort(data, SIZE);
    printf("%.2fs\n", t());
    test(data, SIZE);
    return 0;
}
Branch-Avoidant Simple
Sorting 50 million integers ...
1.70s

C++ std::sort

For comparison C++ std::sort:

// SPDX-License-Identifier: MIT
#include <algorithm>
#include <sys/time.h>

#define TYPE int
#define NAME "std::sort"

void sort(TYPE* data, int len) {
    std::sort(data, data + len);
}

void init(TYPE* data, int len) {
    for (int i = 0; i < len; i++) {
        data[i] = rand();
    }
}
double t(void) {
    static double twr;
    struct timeval tv;
    gettimeofday(&tv, NULL);
    double h = twr;
    twr = tv.tv_sec + tv.tv_usec / 1000000.0;
    return twr - h;
}

#define SIZE (50 * 1000000)
TYPE data[SIZE];

int main(void) {

    srand(time(NULL));
    init(data, SIZE);
    printf(NAME"\n");
    printf("Sorting %d million integers ...\n", SIZE / 1000000);
    t();
    sort(data, SIZE);
    printf("%.2fs\n", t());
    test(data, SIZE);
    return 0;
}
std::sort
Sorting 50 million integers ...
1.19s

Branch-avoidant Quicksort with up to 12 Sorting-Networks

This variant includes several additional optimizations. One of them is protection against pathological input data.

To avoid the O(n²) runtime caused by many duplicate elements, the program switches to heap sort for a segment if it detects a significant imbalance in the partitioning.

The other optimizations focus on performance.

For larger partitions, the algorithm switches to a more robust pivot selection strategy based on a median-of-medians approach to keep partitions well balanced. In addition, critical partitioning loops are explicitly unrolled to reduce loop overhead and to better exploit instruction-level parallelism on modern CPUs.

Small subsets of up to 12 elements are handled using specialized sorting networks. These subsets are sorted directly with a minimal number of swaps. While this approach is faster, it requires separate code paths for each partition size from 2 to 12.

By using a branchless sort‑2 primitive, all comparisons and swaps are expressed as a fixed sequence of operations, removing conditional branches entirely.

Source for sorting networks: Link

As a result, this implementation becomes faster than, for example, std::sort and pdqsort.

// SPDX-License-Identifier: MIT
// (c) 2026 christof.kaser@gmail.com
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>
#include <stddef.h>

#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int

#define NAME "Branch-Avoidant Quicksort Sorting-Network 12"

TYPE* partition(TYPE* left, TYPE* right);
void sorting_network(TYPE* left, int size);
void heap_sort(TYPE* left, TYPE* right);
TYPE* partition_small(TYPE* left, TYPE* right);

#define SMALLPART 256

void sort(TYPE* data, int len) {

    TYPE* left = data;
    TYPE* stack[80];    // for max 2^40 items
    unsigned sp = 0;
    TYPE* right = data + len - 1;

    while (1) {
        ptrdiff_t partsz = right - left;
        TYPE* mid;
        if (partsz <= SMALLPART) {
            if (partsz <= 11) {
                sorting_network(left, partsz);
                goto pop_stack;
            }
            mid = partition_small(left, right);
        }
        else  {
            mid = partition(left, right);
            if ((mid - left) * 16 < partsz) {
                heap_sort(left, right);
                goto pop_stack;
            }
        }
        if (mid < left + partsz / 2) {
            stack[sp] = mid + 1;
            stack[sp + 1] = right;
            right = mid - 1;
        }
        else {
            stack[sp] = left;
            stack[sp + 1] = mid - 1;
            left = mid + 1;
        }
        sp += 2;
        continue;

    pop_stack:
        if (sp == 0) break;
        sp -= 2;
        left = stack[sp];
        right = stack[sp + 1];
    }
}

// ---------------------- sorting network ----------------------------

#define sort2(a, b) do {  \
    TYPE x = a; \
    TYPE y = b; \
    unsigned m = IS_LOWER(x, y); \
    a = m ? x : y;  \
    b = m ? y : x;  \
} while(0)

// https://bertdobbelaere.github.io/sorting_networks.html

#define sort3(a,b,c) do { \
    sort2(a,c);\
    sort2(a,b);\
    sort2(b,c);\
} while(0)

#define sort4(a,b,c,d) do { \
    sort2(a,c);sort2(b,d);\
    sort2(a,b);sort2(c,d);\
    sort2(b,c);\
} while(0)

#define sort5(a,b,c,d,e) do { \
    sort2(a,d);sort2(b,e);\
    sort2(a,c);sort2(b,d);\
    sort2(a,b);sort2(c,e);\
    sort2(b,c);sort2(d,e);\
    sort2(c,d);\
} while(0)

#define sort6(a,b,c,d,e,f) do { \
    sort2(a,f);sort2(b,d);sort2(c,e);\
    sort2(b,c);sort2(d,e);\
    sort2(a,d);sort2(c,f);\
    sort2(a,b);sort2(c,d);sort2(e,f);\
    sort2(b,c);sort2(d,e);\
} while(0)

#define sort7(a,b,c,d,e,f,g) do { \
    sort2(a,g);sort2(c,d);sort2(e,f);\
    sort2(a,c);sort2(b,e);sort2(d,g);\
    sort2(a,b);sort2(c,f);sort2(d,e);\
    sort2(b,c);sort2(e,g);\
    sort2(c,d);sort2(e,f);\
    sort2(b,c);sort2(d,e);sort2(f,g);\
} while(0)

#define sort8(a,b,c,d,e,f,g,h) do { \
    sort2(a,c);sort2(b,d);sort2(e,g);sort2(f,h);\
    sort2(a,e);sort2(b,f);sort2(c,g);sort2(d,h);\
    sort2(a,b);sort2(c,d);sort2(e,f);sort2(g,h);\
    sort2(c,e);sort2(d,f);\
    sort2(b,e);sort2(d,g);\
    sort2(b,c);sort2(d,e);sort2(f,g);\
} while (0)

#define sort9(a,b,c,d,e,f,g,h,i) do { \
    sort2(a,d);sort2(b,h);sort2(c,f);sort2(e,i);\
    sort2(a,h);sort2(c,e);sort2(d,i);sort2(f,g);\
    sort2(a,c);sort2(b,d);sort2(e,f);sort2(h,i);\
    sort2(b,e);sort2(d,g);sort2(f,h);\
    sort2(a,b);sort2(c,e);sort2(d,f);sort2(g,i);\
    sort2(c,d);sort2(e,f);sort2(g,h);\
    sort2(b,c);sort2(d,e);sort2(f,g);\
} while (0)

#define sort10(a,b,c,d,e,f,g,h,i,j) do { \
    sort2(a,i);sort2(b,j);sort2(c,h);sort2(d,f);sort2(e,g);\
    sort2(a,c);sort2(b,e);sort2(f,i);sort2(h,j);\
    sort2(a,d);sort2(c,e);sort2(f,h);sort2(g,j);\
    sort2(a,b);sort2(d,g);sort2(i,j);\
    sort2(b,f);sort2(c,d);sort2(e,i);sort2(g,h);\
    sort2(b,c);sort2(d,f);sort2(e,g);sort2(h,i);\
    sort2(c,d);sort2(e,f);sort2(g,h);\
    sort2(d,e);sort2(f,g);\
} while(0)

#define sort11(a,b,c,d,e,f,g,h,i,j,k) do { \
    sort2(a,j);sort2(b,g);sort2(c,e);sort2(d,h);sort2(f,i);\
    sort2(a,b);sort2(d,f);sort2(e,k);sort2(g,j);sort2(h,i);\
    sort2(b,d);sort2(c,f);sort2(e,h);sort2(i,k);\
    sort2(a,e);sort2(b,c);sort2(d,h);sort2(f,j);sort2(g,i);\
    sort2(a,b);sort2(c,g);sort2(e,f);sort2(h,i);sort2(j,k);\
    sort2(c,e);sort2(d,g);sort2(f,h);sort2(i,j);\
    sort2(b,c);sort2(d,e);sort2(f,g);sort2(h,i);\
    sort2(c,d);sort2(e,f);sort2(g,h);\
} while (0)

#define sort12(a,b,c,d,e,f,g,h,i,j,k,l) do { \
    sort2(a,i);sort2(b,h);sort2(c,g);sort2(d,l);sort2(e,k);sort2(f,j);\
    sort2(a,b);sort2(c,f);sort2(d,e);sort2(g,j);sort2(h,i);sort2(k,l);\
    sort2(a,c);sort2(b,g);sort2(f,k);sort2(j,l);\
    sort2(a,d);sort2(b,c);sort2(e,g);sort2(f,h);sort2(i,l);sort2(j,k);\
    sort2(b,e);sort2(d,f);sort2(g,i);sort2(h,k);\
    sort2(b,d);sort2(c,f);sort2(g,j);sort2(i,k);\
    sort2(c,d);sort2(e,f);sort2(g,h);sort2(i,j);\
    sort2(e,g);sort2(f,h);\
    sort2(d,e);sort2(f,g);sort2(h,i);\
} while (0)

void sorting_network(TYPE* l, int partsz_min_1) {
    // partsz_min_1 == right - left
    switch (partsz_min_1) {
    case 11:
        sort12(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11]);
        break;
    case 10:
        sort11(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8], l[9], l[10]);
        break;
    case 9:
        sort10(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8], l[9]);
        break;
    case 8:
        sort9(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8]);
        break;
    case 7:
        sort8(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7]);
        break;
    case 6:
        sort7(l[0], l[1], l[2], l[3], l[4], l[5], l[6]);
        break;
    case 5:
        sort6(l[0], l[1], l[2], l[3], l[4], l[5]);
        break;
    case 4:
        sort5(l[0], l[1], l[2], l[3], l[4]);
        break;
    case 3:
        sort4(l[0], l[1], l[2], l[3]);
        break;
    case 2:
        sort3(l[0], l[1], l[2]);
        break;
    case 1:
        sort2(l[0], l[1]);
        break;
    case 0:
        break;
    }
}
// ---------------------- partition ----------------------------

#define med5(a, b, c, d, e) do { \
    sort2(a, b); sort2(c, d); sort2(a, c); \
    sort2(b, d); sort2(b, c); sort2(c, e); \
    sort2(b, c); \
} while(0)

TYPE* partition_small(TYPE* restrict  left, TYPE* restrict  right) {
    TYPE* outerleft = left;
    TYPE* pivp = left + (right - left) / 2;

    med5(left[1], left[2], *pivp, right[-1], *right);
    left += 3;
    right -= 2;

    TYPE piv = *pivp;
    *pivp = *outerleft;

    TYPE swbuf[SMALLPART];
    TYPE* sw = swbuf;
    TYPE* lwr = left;
    while (left <= right) {
        unsigned h = IS_LOWER(*left, piv); *lwr = *sw = *left++; lwr += h; sw += !h;
    }
    memcpy(lwr, swbuf, (sw - swbuf) * sizeof(TYPE));
    lwr -= 1;
    *outerleft = *lwr;
    *lwr = piv;
    return lwr;
}

#define SWSZ 2048
#define UNROLL 16

TYPE* partition(TYPE* restrict left, TYPE* restrict right) {

    TYPE* outerleft = left;
    TYPE* pivp = left + (right - left) / 2;

    med5(left[3], left[4], left[1], left[5], left[6]);
    med5(left[11], left[12], left[2], left[13], left[14]);
    med5(pivp[-20], pivp[-10], pivp[0], pivp[10], pivp[20]);
    med5(right[-6], right[-7], right[-1], right[-8], right[-9]);
    med5(right[-15], right[-14], right[0], right[-13], right[-12]);
    med5(left[1], left[2], pivp[0], right[-1], right[0]);
    left += 3;
    right -= 2;

    TYPE piv = *pivp;
    *pivp = *outerleft;

    TYPE swbuf[SWSZ];

    TYPE* lwr = left;
    TYPE* rwr = right;

    TYPE* sw = swbuf;
    while (sw < swbuf + SWSZ - UNROLL && left <= right - UNROLL) {
        for (int i = UNROLL; i--;) {
            unsigned h = IS_LOWER(*right, piv); *rwr = *sw = *right--; rwr -= !h; sw += h;
        }
    }
    while (sw < swbuf + SWSZ - UNROLL && left <= right) {
        unsigned h = IS_LOWER(*right, piv); *rwr = *sw = *right--; rwr -= !h; sw += h;
    }
    while (left <= right - UNROLL) {
        while (rwr > right + UNROLL && left <= right - UNROLL) {
            for (int i = UNROLL; i--;) {
                unsigned h = IS_LOWER(*left, piv); *lwr = *rwr = *left++; lwr += h; rwr -= !h;
            }
        }
        while (lwr < left - UNROLL && left <= right - UNROLL) {
            for (int i = UNROLL; i--;) {
                unsigned h = IS_LOWER(*right, piv); *rwr = *lwr = *right--; rwr -= !h; lwr += h;
            }
        }
    }
    while (rwr > right && left <= right) {
        unsigned h = IS_LOWER(*left, piv); *lwr = *rwr = *left++; lwr += h; rwr -= !h;
    }
    while (left <= right) {
        unsigned h = IS_LOWER(*right, piv); *rwr = *lwr = *right--; rwr -= !h; lwr += h;
    }
    memcpy(lwr, swbuf, (sw - swbuf) * sizeof(TYPE));

    *outerleft = *rwr;
    *rwr = piv;
    return rwr;
}

// ---------------------- heap_sort, testing and main -----------------------------------

void heap_sort(TYPE* left, TYPE* right) {

    long n = right - left + 1;
    if (n < 2) return;
    for (long i = n / 2, j; ; ) {
        TYPE k;
        if (i > 0) {
            k = left[--i];
        }
        else {
            n -= 1;
            if (n == 0) return;
            k = left[n];
            left[n] = left[0];
        }
        j = i;
        while (j * 2 + 1 < n) {
            long child = j * 2 + 1;
            if (child + 1 < n && IS_LOWER(left[child], left[child + 1])) child++;
            if (!IS_LOWER(k, left[child])) break;
            left[j] = left[child];
            j = child;
        }
        left[j] = k;
    }
}

unsigned chksum;
unsigned hash_el(void *p, int sz) {
    unsigned char* c = (unsigned char*)p;
    unsigned h = 0;
    for (int i = 0; i < sz; i++) h = (h * 131) + c[i];
    return h;
}
void init(TYPE* data, int len) {
    chksum = 0;
    for (int i = 0; i < len; i++) {
        data[i] = rand();
        chksum += hash_el(&data[i], sizeof(TYPE));
    }
}
void test(TYPE* data, int len) {
    unsigned chks = hash_el(&data[0], sizeof(TYPE));;
    for (int i = 1; i < len; i++) {
        if (IS_LOWER(data[i], data[i - 1])) {
            printf("ERROR ORDER\n");
            return;
        }
        chks += hash_el(&data[i], sizeof(TYPE));
    }
    if (chks != chksum) printf("ERROR CHKS\n");
}
double t(void) {

    static double twr;
    struct timeval tv;
    gettimeofday(&tv, NULL);
    double h = twr;
    twr = tv.tv_sec + tv.tv_usec / 1000000.0;
    return twr - h;
}

#define SIZE (50 * 1000000)
TYPE data[SIZE];

int main(void) {
    srand(time(NULL));
    init(data, SIZE);
    printf(NAME"\n");
    printf("Sorting %d million integers ...\n", SIZE / 1000000);
    t();
    sort(data, SIZE);
    printf("%.2fs\n", t());
    test(data, SIZE);

    return 0;
}
Branch-Avoidant Quicksort Sorting-Network 12
Sorting 50 million integers ...
0.91s

Branch-avoidant Quicksort with up to 26 Sorting-Networks

We can make this even faster relatively easily by extending the sorting networks.

Of course, this inevitably increases the amount of code, because larger sorting networks require significantly more instructions.

However, by implementing sorting networks for sizes up to 26 elements, we can squeeze out a few additional percent of performance.

// SPDX-License-Identifier: MIT
// (c) 2026 christof.kaser@gmail.com
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>
#include <stddef.h>

#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int

#define NAME "Branch-Avoidant Quicksort Sorting-Network 26"

TYPE* partition(TYPE* left, TYPE* right);
void sorting_network(TYPE* left, int size);
void heap_sort(TYPE* left, TYPE* right);
TYPE* partition_small(TYPE* left, TYPE* right);

#define SMALLPART 256

void sort(TYPE* data, int len) {

    TYPE* left = data;
    TYPE* stack[80];    // for max 2^40 items
    unsigned sp = 0;
    TYPE* right = data + len - 1;

    while (1) {
        ptrdiff_t partsz = right - left;
        TYPE* mid;
        if (partsz <= SMALLPART) {
            if (partsz <= 25) {
                sorting_network(left, partsz);
                goto pop_stack;
            }
            mid = partition_small(left, right);
        }
        else  {
            mid = partition(left, right);
            if ((mid - left) * 16 < partsz) {
                heap_sort(left, right);
                goto pop_stack;
            }
        }
        if (mid < left + partsz / 2) {
            stack[sp] = mid + 1;
            stack[sp + 1] = right;
            right = mid - 1;
        }
        else {
            stack[sp] = left;
            stack[sp + 1] = mid - 1;
            left = mid + 1;
        }
        sp += 2;
        continue;

    pop_stack:
        if (sp == 0) break;
        sp -= 2;
        left = stack[sp];
        right = stack[sp + 1];
    }
}

// ---------------------- sorting network ----------------------------

#define sort2(a, b) do {  \
    TYPE x = a; \
    TYPE y = b; \
    unsigned m = IS_LOWER(x, y); \
    a = m ? x : y;  \
    b = m ? y : x;  \
} while(0)

// https://bertdobbelaere.github.io/sorting_networks.html

#define sort3(a,b,c) do { \
    sort2(a,c);\
    sort2(a,b);\
    sort2(b,c);\
} while(0)

#define sort4(a,b,c,d) do { \
    sort2(a,c);sort2(b,d);\
    sort2(a,b);sort2(c,d);\
    sort2(b,c);\
} while(0)

#define sort5(a,b,c,d,e) do { \
    sort2(a,d);sort2(b,e);\
    sort2(a,c);sort2(b,d);\
    sort2(a,b);sort2(c,e);\
    sort2(b,c);sort2(d,e);\
    sort2(c,d);\
} while(0)

#define sort6(a,b,c,d,e,f) do { \
    sort2(a,f);sort2(b,d);sort2(c,e);\
    sort2(b,c);sort2(d,e);\
    sort2(a,d);sort2(c,f);\
    sort2(a,b);sort2(c,d);sort2(e,f);\
    sort2(b,c);sort2(d,e);\
} while(0)

#define sort7(a,b,c,d,e,f,g) do { \
    sort2(a,g);sort2(c,d);sort2(e,f);\
    sort2(a,c);sort2(b,e);sort2(d,g);\
    sort2(a,b);sort2(c,f);sort2(d,e);\
    sort2(b,c);sort2(e,g);\
    sort2(c,d);sort2(e,f);\
    sort2(b,c);sort2(d,e);sort2(f,g);\
} while(0)

#define sort8(a,b,c,d,e,f,g,h) do { \
    sort2(a,c);sort2(b,d);sort2(e,g);sort2(f,h);\
    sort2(a,e);sort2(b,f);sort2(c,g);sort2(d,h);\
    sort2(a,b);sort2(c,d);sort2(e,f);sort2(g,h);\
    sort2(c,e);sort2(d,f);\
    sort2(b,e);sort2(d,g);\
    sort2(b,c);sort2(d,e);sort2(f,g);\
} while (0)

#define sort9(a,b,c,d,e,f,g,h,i) do { \
    sort2(a,d);sort2(b,h);sort2(c,f);sort2(e,i);\
    sort2(a,h);sort2(c,e);sort2(d,i);sort2(f,g);\
    sort2(a,c);sort2(b,d);sort2(e,f);sort2(h,i);\
    sort2(b,e);sort2(d,g);sort2(f,h);\
    sort2(a,b);sort2(c,e);sort2(d,f);sort2(g,i);\
    sort2(c,d);sort2(e,f);sort2(g,h);\
    sort2(b,c);sort2(d,e);sort2(f,g);\
} while (0)

#define sort10(a,b,c,d,e,f,g,h,i,j) do { \
    sort2(a,i);sort2(b,j);sort2(c,h);sort2(d,f);sort2(e,g);\
    sort2(a,c);sort2(b,e);sort2(f,i);sort2(h,j);\
    sort2(a,d);sort2(c,e);sort2(f,h);sort2(g,j);\
    sort2(a,b);sort2(d,g);sort2(i,j);\
    sort2(b,f);sort2(c,d);sort2(e,i);sort2(g,h);\
    sort2(b,c);sort2(d,f);sort2(e,g);sort2(h,i);\
    sort2(c,d);sort2(e,f);sort2(g,h);\
    sort2(d,e);sort2(f,g);\
} while(0)

#define sort11(a,b,c,d,e,f,g,h,i,j,k) do { \
    sort2(a,j);sort2(b,g);sort2(c,e);sort2(d,h);sort2(f,i);\
    sort2(a,b);sort2(d,f);sort2(e,k);sort2(g,j);sort2(h,i);\
    sort2(b,d);sort2(c,f);sort2(e,h);sort2(i,k);\
    sort2(a,e);sort2(b,c);sort2(d,h);sort2(f,j);sort2(g,i);\
    sort2(a,b);sort2(c,g);sort2(e,f);sort2(h,i);sort2(j,k);\
    sort2(c,e);sort2(d,g);sort2(f,h);sort2(i,j);\
    sort2(b,c);sort2(d,e);sort2(f,g);sort2(h,i);\
    sort2(c,d);sort2(e,f);sort2(g,h);\
} while (0)

#define sort12(a,b,c,d,e,f,g,h,i,j,k,l) do { \
    sort2(a,i);sort2(b,h);sort2(c,g);sort2(d,l);sort2(e,k);sort2(f,j);\
    sort2(a,b);sort2(c,f);sort2(d,e);sort2(g,j);sort2(h,i);sort2(k,l);\
    sort2(a,c);sort2(b,g);sort2(f,k);sort2(j,l);\
    sort2(a,d);sort2(b,c);sort2(e,g);sort2(f,h);sort2(i,l);sort2(j,k);\
    sort2(b,e);sort2(d,f);sort2(g,i);sort2(h,k);\
    sort2(b,d);sort2(c,f);sort2(g,j);sort2(i,k);\
    sort2(c,d);sort2(e,f);sort2(g,h);sort2(i,j);\
    sort2(e,g);sort2(f,h);\
    sort2(d,e);sort2(f,g);sort2(h,i);\
} while (0)

#define sort13(a,b,c,d,e,f,g,h,i,j,k,l,m) do { \
    sort2(a,m);sort2(b,k);sort2(c,j);sort2(d,h);sort2(f,l);sort2(g,i);\
    sort2(b,g);sort2(c,d);sort2(e,l);sort2(h,j);sort2(i,k);\
    sort2(a,e);sort2(b,c);sort2(d,g);sort2(h,i);sort2(j,k);sort2(l,m);\
    sort2(e,g);sort2(f,j);sort2(i,l);sort2(k,m);\
    sort2(a,f);sort2(d,i);sort2(e,h);sort2(g,l);sort2(j,k);\
    sort2(a,b);sort2(c,f);sort2(g,j);sort2(h,i);sort2(k,l);\
    sort2(b,d);sort2(c,e);sort2(f,g);sort2(j,k);\
    sort2(b,c);sort2(d,e);sort2(f,h);sort2(g,i);\
    sort2(c,d);sort2(e,f);sort2(g,h);sort2(i,j);\
    sort2(d,e);sort2(f,g);\
} while (0)

#define sort14(a,b,c,d,e,f,g,h,i,j,k,l,m,n) do { \
    sort2(a,b);sort2(c,d);sort2(e,f);sort2(g,h);sort2(i,j);sort2(k,l);sort2(m,n);\
    sort2(a,c);sort2(b,d);sort2(e,i);sort2(f,j);sort2(k,m);sort2(l,n);\
    sort2(a,e);sort2(b,c);sort2(d,h);sort2(f,i);sort2(g,k);sort2(j,n);sort2(l,m);\
    sort2(a,g);sort2(b,f);sort2(d,j);sort2(e,k);sort2(h,n);sort2(i,m);\
    sort2(c,k);sort2(d,l);sort2(e,g);sort2(h,j);\
    sort2(b,d);sort2(c,i);sort2(f,l);sort2(g,h);sort2(k,m);\
    sort2(b,e);sort2(c,g);sort2(d,f);sort2(h,l);sort2(i,k);sort2(j,m);\
    sort2(c,e);sort2(d,g);sort2(f,i);sort2(h,k);sort2(j,l);\
    sort2(d,e);sort2(f,g);sort2(h,i);sort2(j,k);\
    sort2(g,h);\
} while(0)

#define sort15(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o) do { \
    sort2(b,c);sort2(d,k);sort2(e,o);sort2(f,i);sort2(g,n);sort2(h,m);sort2(j,l);\
    sort2(a,o);sort2(b,f);sort2(c,i);sort2(d,h);sort2(g,j);sort2(k,m);sort2(l,n);\
    sort2(a,h);sort2(b,g);sort2(c,j);sort2(e,k);sort2(f,l);sort2(i,n);sort2(m,o);\
    sort2(a,g);sort2(c,e);sort2(d,f);sort2(h,l);sort2(i,k);sort2(j,m);sort2(n,o);\
    sort2(a,d);sort2(b,c);sort2(e,h);sort2(f,j);sort2(g,i);sort2(k,l);sort2(m,n);\
    sort2(a,b);sort2(c,d);sort2(e,g);sort2(h,j);sort2(k,m);sort2(l,n);\
    sort2(b,c);sort2(d,f);sort2(i,k);sort2(l,m);\
    sort2(d,e);sort2(f,g);sort2(h,i);sort2(j,k);\
    sort2(c,d);sort2(e,f);sort2(g,h);sort2(i,j);sort2(k,l);\
    sort2(f,g);sort2(h,i);\
} while(0)

#define sort16(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) do { \
    sort2(a,n);sort2(b,m);sort2(c,p);sort2(d,o);sort2(e,i);\
    sort2(f,g);sort2(h,l);sort2(j,k);\
    sort2(a,f);sort2(b,h);sort2(c,j);sort2(d,e);sort2(g,n);\
    sort2(i,o);sort2(k,p);sort2(l,m);\
    sort2(a,b);sort2(c,d);sort2(e,f);sort2(g,i);sort2(h,j);\
    sort2(k,l);sort2(m,n);sort2(o,p);\
    sort2(a,c);sort2(b,d);sort2(e,k);sort2(f,l);sort2(g,h);\
    sort2(i,j);sort2(m,o);sort2(n,p);\
    sort2(b,c);sort2(d,m);sort2(e,g);sort2(f,h);sort2(i,k);\
    sort2(j,l);sort2(n,o);\
    sort2(b,e);sort2(c,g);sort2(f,i);sort2(h,k);sort2(j,n);\
    sort2(l,o);\
    sort2(c,e);sort2(d,g);sort2(j,m);sort2(l,n);\
    sort2(d,f);sort2(g,i);sort2(h,j);sort2(k,m);\
    sort2(d,e);sort2(f,g);sort2(h,i);sort2(j,k);sort2(l,m);\
    sort2(g,h);sort2(i,j);\
} while (0)

#define sort17(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q) do { \
    sort2(a,l);sort2(b,p);sort2(c,k);sort2(d,f);sort2(e,g);\
    sort2(i,m);sort2(j,q);sort2(n,o);\
    sort2(a,g);sort2(b,n);sort2(c,i);sort2(e,o);sort2(f,p);\
    sort2(h,l);\
    sort2(a,i);sort2(d,h);sort2(e,j);sort2(g,q);sort2(k,l);\
    sort2(m,o);\
    sort2(a,c);sort2(b,e);sort2(f,g);sort2(h,n);sort2(i,j);\
    sort2(k,m);sort2(l,o);sort2(p,q);\
    sort2(a,d);sort2(c,f);sort2(g,l);sort2(h,k);sort2(j,n);\
    sort2(m,p);sort2(o,q);\
    sort2(a,b);sort2(d,e);sort2(f,k);sort2(g,j);sort2(h,i);\
    sort2(l,p);sort2(n,o);\
    sort2(b,c);sort2(d,h);sort2(e,i);sort2(g,m);sort2(l,n);\
    sort2(o,p);\
    sort2(b,d);sort2(c,h);sort2(e,f);sort2(j,l);sort2(k,m);\
    sort2(n,o);\
    sort2(c,d);sort2(e,g);sort2(f,h);sort2(i,k);\
    sort2(d,e);sort2(g,i);sort2(h,j);sort2(k,m);\
    sort2(f,g);sort2(h,i);sort2(j,k);sort2(l,m);\
    sort2(e,f);sort2(g,h);sort2(i,j);sort2(k,l);sort2(m,n);\
} while(0)

#define sort18(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r) do { \
    sort2(a,b);sort2(c,d);sort2(e,f);sort2(g,h);sort2(i,j);\
    sort2(k,l);sort2(m,n);sort2(o,p);sort2(q,r);\
    sort2(a,c);sort2(b,d);sort2(e,m);sort2(f,n);sort2(g,i);\
    sort2(j,l);sort2(o,q);sort2(p,r);\
    sort2(a,o);sort2(b,q);sort2(c,p);sort2(d,r);\
    sort2(a,g);sort2(b,k);sort2(c,j);sort2(h,q);sort2(i,p);\
    sort2(l,r);\
    sort2(b,e);sort2(d,j);sort2(f,h);sort2(i,o);sort2(k,m);\
    sort2(n,q);\
    sort2(a,b);sort2(c,f);sort2(d,n);sort2(e,o);sort2(h,j);\
    sort2(i,k);sort2(m,p);sort2(q,r);\
    sort2(b,c);sort2(d,f);sort2(e,g);sort2(l,n);sort2(m,o);\
    sort2(p,q);\
    sort2(e,i);sort2(f,m);sort2(g,k);sort2(h,l);sort2(j,n);\
    sort2(b,e);sort2(c,i);sort2(d,g);sort2(f,h);sort2(j,p);\
    sort2(k,m);sort2(l,o);sort2(n,q);\
    sort2(c,e);sort2(f,i);sort2(g,k);sort2(h,l);sort2(j,m);\
    sort2(n,p);\
    sort2(d,f);sort2(g,i);sort2(h,k);sort2(j,l);sort2(m,o);\
    sort2(d,e);sort2(f,g);sort2(h,i);sort2(j,k);sort2(l,m);\
    sort2(n,o);\
} while(0)

#define sort19(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s) do { \
    sort2(a,m);sort2(b,e);sort2(c,i);sort2(d,f);sort2(g,r);\
    sort2(h,l);sort2(j,o);sort2(k,n);sort2(p,q);\
    sort2(a,c);sort2(b,h);sort2(d,g);sort2(e,l);sort2(f,r);\
    sort2(i,m);sort2(k,p);sort2(n,q);sort2(o,s);\
    sort2(d,k);sort2(e,o);sort2(f,p);sort2(g,n);sort2(h,j);\
    sort2(l,r);sort2(q,s);\
    sort2(a,h);sort2(b,k);sort2(e,g);sort2(j,p);sort2(l,q);\
    sort2(m,r);sort2(n,o);\
    sort2(a,d);sort2(c,g);sort2(f,h);sort2(i,l);sort2(m,q);\
    sort2(b,i);sort2(c,j);sort2(d,e);sort2(g,p);sort2(h,n);\
    sort2(k,l);sort2(m,s);\
    sort2(b,d);sort2(c,f);sort2(g,j);sort2(h,m);sort2(i,k);\
    sort2(l,o);sort2(r,s);\
    sort2(a,b);sort2(c,d);sort2(e,i);sort2(g,k);sort2(j,m);\
    sort2(o,p);sort2(q,r);\
    sort2(b,c);sort2(f,i);sort2(g,h);sort2(j,l);sort2(k,n);\
    sort2(o,q);sort2(p,r);\
    sort2(d,g);sort2(e,f);sort2(h,j);sort2(i,k);sort2(l,m);\
    sort2(n,o);sort2(p,q);\
    sort2(d,e);sort2(f,g);sort2(h,i);sort2(j,k);sort2(l,n);\
    sort2(m,o);\
    sort2(c,d);sort2(e,f);sort2(g,h);sort2(i,j);sort2(k,l);\
    sort2(m,n);sort2(o,p);\
} while(0)

#define sort20(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t) do { \
    sort2(a,d);sort2(b,h);sort2(c,f);sort2(e,i);sort2(g,j);\
    sort2(k,n);sort2(l,p);sort2(m,s);sort2(o,r);sort2(q,t);\
    sort2(a,o);sort2(b,l);sort2(c,q);sort2(d,r);sort2(e,m);\
    sort2(f,t);sort2(g,k);sort2(h,p);sort2(i,s);sort2(j,n);\
    sort2(a,e);sort2(b,c);sort2(d,i);sort2(f,h);sort2(l,q);\
    sort2(m,o);sort2(p,t);sort2(r,s);\
    sort2(b,g);sort2(c,m);sort2(d,f);sort2(e,l);sort2(h,r);\
    sort2(i,p);sort2(n,s);sort2(o,q);\
    sort2(a,b);sort2(c,g);sort2(h,k);sort2(j,m);sort2(n,r);\
    sort2(s,t);\
    sort2(b,g);sort2(f,j);sort2(h,l);sort2(i,m);sort2(k,o);\
    sort2(n,s);\
    sort2(d,f);sort2(e,h);sort2(i,k);sort2(j,l);sort2(m,p);\
    sort2(o,q);\
    sort2(b,d);sort2(c,e);sort2(f,h);sort2(g,k);sort2(j,n);\
    sort2(m,o);sort2(p,r);sort2(q,s);\
    sort2(b,c);sort2(d,e);sort2(g,h);sort2(i,j);sort2(k,l);\
    sort2(m,n);sort2(p,q);sort2(r,s);\
    sort2(c,d);sort2(e,g);sort2(f,i);sort2(h,j);sort2(k,m);\
    sort2(l,o);sort2(n,p);sort2(q,r);\
    sort2(e,f);sort2(g,i);sort2(h,k);sort2(j,m);sort2(l,n);\
    sort2(o,p);\
    sort2(d,e);sort2(f,g);sort2(h,i);sort2(j,k);sort2(l,m);\
    sort2(n,o);sort2(p,q);\
} while(0)

#define sort21(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u) do { \
    sort2(a,b);sort2(c,d);sort2(e,f);sort2(g,h);sort2(i,j);\
    sort2(k,l);sort2(m,n);sort2(o,p);sort2(q,r);sort2(s,t);\
    sort2(a,c);sort2(b,d);sort2(e,g);sort2(f,h);sort2(i,k);\
    sort2(j,l);sort2(m,o);sort2(n,p);sort2(q,s);sort2(r,t);\
    sort2(a,i);sort2(b,j);sort2(c,k);sort2(d,l);sort2(e,m);\
    sort2(f,n);sort2(g,o);sort2(h,p);\
    sort2(a,e);sort2(b,f);sort2(d,h);sort2(g,u);sort2(i,m);\
    sort2(j,n);sort2(k,o);sort2(p,t);\
    sort2(c,g);sort2(d,s);sort2(h,u);\
    sort2(c,q);sort2(d,g);sort2(f,s);sort2(h,r);sort2(l,u);\
    sort2(a,c);sort2(d,i);sort2(g,m);sort2(h,k);sort2(j,q);\
    sort2(l,p);sort2(n,r);sort2(o,s);sort2(t,u);\
    sort2(b,h);sort2(c,d);sort2(e,j);sort2(k,l);sort2(n,q);\
    sort2(p,s);sort2(r,t);\
    sort2(b,e);sort2(f,k);sort2(g,n);sort2(h,i);sort2(l,o);\
    sort2(m,q);sort2(p,r);sort2(s,t);\
    sort2(b,c);sort2(d,e);sort2(f,g);sort2(k,m);sort2(l,n);\
    sort2(o,q);sort2(r,s);\
    sort2(c,d);sort2(e,f);sort2(g,j);sort2(k,l);sort2(m,n);\
    sort2(o,p);sort2(q,r);\
    sort2(g,h);sort2(i,j);sort2(p,q);\
    sort2(e,g);sort2(h,i);sort2(j,m);sort2(n,p);\
    sort2(d,e);sort2(f,h);sort2(i,k);sort2(j,l);sort2(m,o);\
    sort2(f,g);sort2(h,i);sort2(j,k);sort2(l,m);sort2(n,o);\
} while(0)

#define sort22(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v) do { \
    sort2(a,b);sort2(c,d);sort2(e,f);sort2(g,h);sort2(i,j);\
    sort2(k,l);sort2(m,n);sort2(o,p);sort2(q,r);sort2(s,t);\
    sort2(u,v);\
    sort2(a,c);sort2(b,d);sort2(e,g);sort2(f,h);sort2(i,m);\
    sort2(j,n);sort2(o,q);sort2(p,r);sort2(s,u);sort2(t,v);\
    sort2(a,e);sort2(b,f);sort2(c,g);sort2(d,h);sort2(i,k);\
    sort2(j,m);sort2(l,n);sort2(o,s);sort2(p,t);sort2(q,u);\
    sort2(r,v);\
    sort2(a,o);sort2(b,p);sort2(c,s);sort2(d,t);sort2(e,q);\
    sort2(f,r);sort2(g,u);sort2(h,v);sort2(j,l);sort2(k,m);\
    sort2(c,i);sort2(d,l);sort2(g,j);sort2(k,s);sort2(m,p);\
    sort2(n,t);\
    sort2(a,c);sort2(b,k);sort2(d,q);sort2(f,s);sort2(g,o);\
    sort2(h,p);sort2(i,m);sort2(j,n);sort2(l,u);sort2(t,v);\
    sort2(c,g);sort2(d,k);sort2(e,i);sort2(f,m);sort2(j,q);\
    sort2(l,s);sort2(n,r);sort2(p,t);\
    sort2(b,e);sort2(h,n);sort2(i,o);sort2(j,m);sort2(r,u);\
    sort2(b,c);sort2(d,i);sort2(e,g);sort2(h,l);sort2(k,o);\
    sort2(n,s);sort2(p,r);sort2(t,u);\
    sort2(c,e);sort2(f,k);sort2(h,j);sort2(l,q);sort2(m,o);\
    sort2(r,t);\
    sort2(f,g);sort2(h,i);sort2(j,l);sort2(k,m);sort2(n,o);\
    sort2(p,q);\
    sort2(d,f);sort2(g,h);sort2(i,k);sort2(j,m);sort2(l,n);\
    sort2(o,p);sort2(q,s);\
    sort2(d,e);sort2(f,g);sort2(h,i);sort2(j,k);sort2(l,m);\
    sort2(n,o);sort2(p,q);sort2(r,s);\
} while(0)

#define sort23(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w) do { \
    sort2(a,b);sort2(c,d);sort2(e,f);sort2(g,h);sort2(i,j);\
    sort2(k,l);sort2(m,n);sort2(o,p);sort2(q,r);sort2(s,t);\
    sort2(u,v);\
    sort2(a,c);sort2(b,d);sort2(e,g);sort2(f,h);sort2(i,k);\
    sort2(j,l);sort2(m,o);sort2(n,p);sort2(q,s);sort2(r,t);\
    sort2(v,w);\
    sort2(a,e);sort2(b,f);sort2(c,g);sort2(d,h);sort2(i,m);\
    sort2(j,n);sort2(k,o);sort2(l,p);sort2(r,v);sort2(s,u);\
    sort2(t,w);\
    sort2(a,i);sort2(b,j);sort2(c,k);sort2(d,l);sort2(e,m);\
    sort2(f,n);sort2(g,o);sort2(h,p);\
    sort2(b,c);sort2(f,s);sort2(h,t);sort2(j,q);sort2(k,v);\
    sort2(m,u);sort2(p,w);\
    sort2(f,j);sort2(g,h);sort2(k,s);sort2(l,v);sort2(m,r);\
    sort2(n,u);sort2(o,p);\
    sort2(d,r);sort2(g,q);sort2(h,o);sort2(i,m);sort2(p,t);\
    sort2(u,v);\
    sort2(d,e);sort2(f,i);sort2(g,k);sort2(j,m);sort2(n,q);\
    sort2(o,p);sort2(r,s);sort2(t,v);\
    sort2(a,f);sort2(b,i);sort2(c,m);sort2(d,j);sort2(e,k);\
    sort2(h,n);sort2(l,r);sort2(o,q);sort2(s,u);\
    sort2(c,g);sort2(d,f);sort2(e,i);sort2(h,l);sort2(k,m);\
    sort2(n,s);sort2(o,r);sort2(p,u);\
    sort2(b,d);sort2(c,f);sort2(g,j);sort2(h,k);sort2(l,n);\
    sort2(m,o);sort2(p,s);sort2(q,r);sort2(t,u);\
    sort2(c,d);sort2(e,g);sort2(i,j);sort2(l,m);sort2(n,o);\
    sort2(p,q);sort2(r,t);\
    sort2(d,e);sort2(f,g);sort2(h,i);sort2(j,k);sort2(m,n);\
    sort2(o,p);sort2(r,s);\
    sort2(e,f);sort2(g,h);sort2(i,j);sort2(k,l);sort2(q,r);\
} while(0)

#define sort24(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x) do { \
    sort2(a,u);sort2(b,m);sort2(c,q);sort2(d,x);sort2(e,g);\
    sort2(f,k);sort2(h,v);sort2(i,o);sort2(j,p);sort2(l,w);\
    sort2(n,s);sort2(r,t);\
    sort2(a,d);sort2(b,l);sort2(c,h);sort2(e,r);sort2(f,n);\
    sort2(g,t);sort2(i,j);sort2(k,s);sort2(m,w);sort2(o,p);\
    sort2(q,v);sort2(u,x);\
    sort2(a,b);sort2(c,e);sort2(d,m);sort2(f,i);sort2(g,j);\
    sort2(h,k);sort2(l,u);sort2(n,q);sort2(o,r);sort2(p,s);\
    sort2(t,v);sort2(w,x);\
    sort2(c,f);sort2(e,i);sort2(g,l);sort2(h,o);sort2(j,q);\
    sort2(m,r);sort2(p,t);sort2(s,v);\
    sort2(b,i);sort2(d,o);sort2(e,h);sort2(j,u);sort2(k,m);\
    sort2(l,n);sort2(p,w);sort2(q,t);\
    sort2(a,h);sort2(b,f);sort2(d,e);sort2(g,l);sort2(i,p);\
    sort2(j,o);sort2(k,n);sort2(m,r);sort2(q,x);sort2(s,w);\
    sort2(t,u);\
    sort2(a,c);sort2(b,g);sort2(e,h);sort2(f,j);sort2(i,k);\
    sort2(n,p);sort2(o,s);sort2(q,t);sort2(r,w);sort2(v,x);\
    sort2(c,d);sort2(e,f);sort2(g,i);sort2(h,j);sort2(k,l);\
    sort2(m,n);sort2(o,q);sort2(p,r);sort2(s,t);sort2(u,v);\
    sort2(b,c);sort2(d,g);sort2(e,k);sort2(h,i);sort2(j,l);\
    sort2(m,o);sort2(n,t);sort2(p,q);sort2(r,u);sort2(v,w);\
    sort2(c,d);sort2(f,k);sort2(g,h);sort2(i,j);sort2(n,s);\
    sort2(o,p);sort2(q,r);sort2(u,v);\
    sort2(d,e);sort2(f,h);sort2(k,m);sort2(l,n);sort2(q,s);\
    sort2(t,u);\
    sort2(e,g);sort2(i,k);sort2(j,m);sort2(l,o);sort2(n,p);\
    sort2(r,t);\
    sort2(f,g);sort2(h,i);sort2(j,k);sort2(l,m);sort2(n,o);\
    sort2(p,q);sort2(r,s);\
} while(0)

#define sort25(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y) do { \
    sort2(a,b);sort2(c,d);sort2(e,f);sort2(g,h);sort2(i,j);\
    sort2(k,l);sort2(m,n);sort2(o,p);sort2(q,r);sort2(s,t);\
    sort2(u,v);sort2(w,x);\
    sort2(a,c);sort2(b,d);sort2(e,g);sort2(f,h);sort2(i,k);\
    sort2(j,l);sort2(m,o);sort2(n,p);sort2(q,s);sort2(r,t);\
    sort2(v,w);sort2(x,y);\
    sort2(a,e);sort2(b,f);sort2(c,g);sort2(d,h);sort2(i,m);\
    sort2(j,n);sort2(k,o);sort2(l,p);sort2(s,v);sort2(u,x);\
    sort2(w,y);\
    sort2(a,i);sort2(b,j);sort2(c,k);sort2(d,l);sort2(e,m);\
    sort2(f,n);sort2(g,o);sort2(h,p);sort2(q,u);sort2(r,w);\
    sort2(t,y);sort2(v,x);\
    sort2(b,s);sort2(d,v);sort2(f,x);sort2(g,t);sort2(l,o);\
    sort2(p,y);\
    sort2(b,q);sort2(d,r);sort2(g,j);sort2(h,l);sort2(n,t);\
    sort2(o,x);\
    sort2(a,b);sort2(c,q);sort2(d,i);sort2(h,u);sort2(k,n);\
    sort2(l,w);sort2(p,x);\
    sort2(b,c);sort2(f,k);sort2(h,s);sort2(l,v);sort2(p,u);\
    sort2(t,w);\
    sort2(e,h);sort2(f,g);sort2(j,s);sort2(k,r);sort2(l,m);\
    sort2(n,v);sort2(o,p);sort2(t,u);sort2(w,x);\
    sort2(d,e);sort2(h,i);sort2(j,k);sort2(l,q);sort2(m,r);\
    sort2(n,s);sort2(t,v);sort2(u,w);\
    sort2(b,d);sort2(c,e);sort2(f,l);sort2(g,q);sort2(h,j);\
    sort2(i,k);sort2(m,n);sort2(o,t);sort2(p,s);\
    sort2(c,d);sort2(f,h);sort2(g,j);sort2(i,l);sort2(k,q);\
    sort2(m,o);sort2(p,r);\
    sort2(d,f);sort2(e,g);sort2(h,i);sort2(j,l);sort2(k,m);\
    sort2(n,o);sort2(p,q);sort2(r,s);\
    sort2(e,h);sort2(g,i);sort2(j,k);sort2(l,m);sort2(n,p);\
    sort2(o,q);sort2(r,t);sort2(s,v);\
    sort2(e,f);sort2(g,h);sort2(i,j);sort2(k,l);sort2(m,n);\
    sort2(o,p);sort2(q,r);sort2(s,t);sort2(u,v);\
} while(0)

#define sort26(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z) do { \
    sort2(a,b);sort2(c,d);sort2(e,f);sort2(g,h);sort2(i,j);\
    sort2(k,l);sort2(m,n);sort2(o,p);sort2(q,r);sort2(s,t);\
    sort2(u,v);sort2(w,x);sort2(y,z);\
    sort2(a,c);sort2(b,d);sort2(e,g);sort2(f,h);sort2(i,k);\
    sort2(j,l);sort2(o,q);sort2(p,r);sort2(s,u);sort2(t,v);\
    sort2(w,y);sort2(x,z);\
    sort2(a,e);sort2(b,g);sort2(c,f);sort2(d,h);sort2(i,o);\
    sort2(j,q);sort2(k,p);sort2(l,r);sort2(s,w);sort2(t,y);\
    sort2(u,x);sort2(v,z);\
    sort2(a,s);sort2(b,t);sort2(c,u);sort2(d,v);sort2(e,w);\
    sort2(f,x);sort2(g,y);sort2(h,z);sort2(j,m);sort2(n,q);\
    sort2(d,l);sort2(i,j);sort2(k,n);sort2(m,p);sort2(o,w);\
    sort2(q,r);\
    sort2(a,i);sort2(b,j);sort2(c,o);sort2(g,m);sort2(h,p);\
    sort2(k,s);sort2(l,x);sort2(n,t);sort2(q,y);sort2(r,z);\
    sort2(b,c);sort2(d,s);sort2(e,i);sort2(h,w);sort2(r,v);\
    sort2(x,y);\
    sort2(d,o);sort2(e,k);sort2(f,s);sort2(h,u);sort2(i,n);\
    sort2(l,w);sort2(m,r);sort2(p,v);\
    sort2(b,e);sort2(f,g);sort2(h,j);sort2(i,k);sort2(p,r);\
    sort2(q,s);sort2(t,u);sort2(v,y);\
    sort2(c,f);sort2(d,k);sort2(g,o);sort2(j,n);sort2(l,t);\
    sort2(m,q);sort2(p,w);sort2(u,x);\
    sort2(c,i);sort2(f,h);sort2(g,j);sort2(l,m);sort2(n,o);\
    sort2(q,t);sort2(r,x);sort2(s,u);\
    sort2(c,e);sort2(d,f);sort2(g,l);sort2(h,k);sort2(j,q);\
    sort2(m,n);sort2(o,t);sort2(p,s);sort2(u,w);sort2(v,x);\
    sort2(d,e);sort2(f,i);sort2(g,h);sort2(j,l);sort2(k,m);\
    sort2(n,p);sort2(o,q);sort2(r,u);sort2(s,t);sort2(v,w);\
    sort2(f,g);sort2(h,i);sort2(j,k);sort2(l,m);sort2(n,o);\
    sort2(p,q);sort2(r,s);sort2(t,u);\
    sort2(e,f);sort2(g,h);sort2(i,j);sort2(k,l);sort2(m,n);\
    sort2(o,p);sort2(q,r);sort2(s,t);sort2(u,v);\
} while(0)

void sorting_network(TYPE* l, int partsz_min_1) {
    // partsz_min_1 == right - left
    switch (partsz_min_1) {
    case 25:
        sort26(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
            l[16], l[17], l[18], l[19], l[20], l[21], l[22], l[23],
            l[24], l[25]);
        break;
    case 24:
        sort25(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
            l[16], l[17], l[18], l[19], l[20], l[21], l[22], l[23], l[24]);
        break;
    case 23:
        sort24(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
            l[16], l[17], l[18], l[19], l[20], l[21], l[22], l[23]);
        break;
    case 22:
        sort23(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
            l[16], l[17], l[18], l[19], l[20], l[21], l[22]);
        break;
    case 21:
        sort22(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
            l[16], l[17], l[18], l[19], l[20], l[21]);
        break;
    case 20:
        sort21(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
            l[16], l[17], l[18], l[19], l[20]);
        break;
    case 19:
        sort20(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
            l[16], l[17], l[18], l[19]);
        break;
    case 18:
        sort19(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
            l[16], l[17], l[18]);
        break;
    case 17:
        sort18(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15], l[16], l[17]);
        break;
    case 16:
        sort17(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15], l[16]);
        break;
    case 15:
        sort16(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15]);
        break;
    case 14:
        sort15(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13], l[14]);
        break;
    case 13:
        sort14(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12], l[13]);
        break;
    case 12:
        sort13(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11], l[12]);
            break;
    case 11:
        sort12(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11]);
        break;
    case 10:
        sort11(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8], l[9], l[10]);
        break;
    case 9:
        sort10(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8], l[9]);
        break;
    case 8:
        sort9(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8]);
        break;
    case 7:
        sort8(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7]);
        break;
    case 6:
        sort7(l[0], l[1], l[2], l[3], l[4], l[5], l[6]);
        break;
    case 5:
        sort6(l[0], l[1], l[2], l[3], l[4], l[5]);
        break;
    case 4:
        sort5(l[0], l[1], l[2], l[3], l[4]);
        break;
    case 3:
        sort4(l[0], l[1], l[2], l[3]);
        break;
    case 2:
        sort3(l[0], l[1], l[2]);
        break;
    case 1:
        sort2(l[0], l[1]);
        break;
    case 0:
        break;
    }
}
// ---------------------- partition ----------------------------

#define med5(a, b, c, d, e) do { \
    sort2(a, b); sort2(c, d); sort2(a, c); \
    sort2(b, d); sort2(b, c); sort2(c, e); \
    sort2(b, c); \
} while(0)

TYPE* partition_small(TYPE* restrict  left, TYPE* restrict  right) {
    TYPE* outerleft = left;
    TYPE* pivp = left + (right - left) / 2;

    med5(left[1], left[2], *pivp, right[-1], *right);
    left += 3;
    right -= 2;

    TYPE piv = *pivp;
    *pivp = *outerleft;

    TYPE swbuf[SMALLPART];
    TYPE* sw = swbuf;
    TYPE* lwr = left;
    while (left <= right) {
        unsigned h = IS_LOWER(*left, piv); *lwr = *sw = *left++; lwr += h; sw += !h;
    }
    memcpy(lwr, swbuf, (sw - swbuf) * sizeof(TYPE));
    lwr -= 1;
    *outerleft = *lwr;
    *lwr = piv;
    return lwr;
}

#define SWSZ 2048
#define UNROLL 16

TYPE* partition(TYPE* restrict left, TYPE* restrict right) {

    TYPE* outerleft = left;
    TYPE* pivp = left + (right - left) / 2;

    med5(left[3], left[4], left[1], left[5], left[6]);
    med5(left[11], left[12], left[2], left[13], left[14]);
    med5(pivp[-20], pivp[-10], pivp[0], pivp[10], pivp[20]);
    med5(right[-6], right[-7], right[-1], right[-8], right[-9]);
    med5(right[-15], right[-14], right[0], right[-13], right[-12]);
    med5(left[1], left[2], pivp[0], right[-1], right[0]);
    left += 3;
    right -= 2;

    TYPE piv = *pivp;
    *pivp = *outerleft;

    TYPE swbuf[SWSZ];

    TYPE* lwr = left;
    TYPE* rwr = right;

    TYPE* sw = swbuf;
    while (sw < swbuf + SWSZ - UNROLL && left <= right - UNROLL) {
        for (int i = UNROLL; i--;) {
            unsigned h = IS_LOWER(*right, piv); *rwr = *sw = *right--; rwr -= !h; sw += h;
        }
    }
    while (sw < swbuf + SWSZ - UNROLL && left <= right) {
        unsigned h = IS_LOWER(*right, piv); *rwr = *sw = *right--; rwr -= !h; sw += h;
    }
    while (left <= right - UNROLL) {
        while (rwr > right + UNROLL && left <= right - UNROLL) {
            for (int i = UNROLL; i--;) {
                unsigned h = IS_LOWER(*left, piv); *lwr = *rwr = *left++; lwr += h; rwr -= !h;
            }
        }
        while (lwr < left - UNROLL && left <= right - UNROLL) {
            for (int i = UNROLL; i--;) {
                unsigned h = IS_LOWER(*right, piv); *rwr = *lwr = *right--; rwr -= !h; lwr += h;
            }
        }
    }
    while (rwr > right && left <= right) {
        unsigned h = IS_LOWER(*left, piv); *lwr = *rwr = *left++; lwr += h; rwr -= !h;
    }
    while (left <= right) {
        unsigned h = IS_LOWER(*right, piv); *rwr = *lwr = *right--; rwr -= !h; lwr += h;
    }
    memcpy(lwr, swbuf, (sw - swbuf) * sizeof(TYPE));

    *outerleft = *rwr;
    *rwr = piv;
    return rwr;
}

// ---------------------- heap_sort, testing and main -----------------------------------

void heap_sort(TYPE* left, TYPE* right) {

    long n = right - left + 1;
    if (n < 2) return;
    for (long i = n / 2, j; ; ) {
        TYPE k;
        if (i > 0) {
            k = left[--i];
        }
        else {
            n -= 1;
            if (n == 0) return;
            k = left[n];
            left[n] = left[0];
        }
        j = i;
        while (j * 2 + 1 < n) {
            long child = j * 2 + 1;
            if (child + 1 < n && IS_LOWER(left[child], left[child + 1])) child++;
            if (!IS_LOWER(k, left[child])) break;
            left[j] = left[child];
            j = child;
        }
        left[j] = k;
    }
}

unsigned chksum;
unsigned hash_el(void *p, int sz) {
    unsigned char* c = (unsigned char*)p;
    unsigned h = 0;
    for (int i = 0; i < sz; i++) h = (h * 131) + c[i];
    return h;
}
void init(TYPE* data, int len) {
    chksum = 0;
    for (int i = 0; i < len; i++) {
        data[i] = rand();
        chksum += hash_el(&data[i], sizeof(TYPE));
    }
}
void test(TYPE* data, int len) {
    unsigned chks = hash_el(&data[0], sizeof(TYPE));;
    for (int i = 1; i < len; i++) {
        if (IS_LOWER(data[i], data[i - 1])) {
            printf("ERROR ORDER\n");
            return;
        }
        chks += hash_el(&data[i], sizeof(TYPE));
    }
    if (chks != chksum) printf("ERROR CHKS\n");
}
double t(void) {

    static double twr;
    struct timeval tv;
    gettimeofday(&tv, NULL);
    double h = twr;
    twr = tv.tv_sec + tv.tv_usec / 1000000.0;
    return twr - h;
}

#define SIZE (50 * 1000000)
TYPE data[SIZE];

int main(void) {

    srand(time(NULL));
    init(data, SIZE);
    printf(NAME"\n");
    printf("Sorting %d million integers ...\n", SIZE / 1000000);
    t();
    sort(data, SIZE);
    printf("%.2fs\n", t());
    test(data, SIZE);
    return 0;
}
Branch-Avoidant Quicksort Sorting-Network 26
Sorting 50 million integers ...
0.86s


Multithreaded sorting

Interactive sorting demo


christof.kaser@gmail.com