A Fast Quicksort in C for Modern CPUs with Threads and Branch‑Avoidant Coding

Multithreading has become increasingly important with stagnating single‑thread performance and widespread multicore CPUs. The challenge is to divide the work across multiple threads in a way that actually improves performance.

This is not possible for every algorithm. However, Quicksort is a classic divide‑and‑conquer algorithm, which makes it well suited for parallelization, since independent subproblems can be processed concurrently.

On modern CPUs, avoiding branch misprediction is another way to speed up programs. One effective way to reduce branch misprediction is to eliminate branches altogether.

for (int i = 0, j = 0; i < 1000; i++) {
    if (numbers[i] < 500) {
        small_numbers[j] = numbers[i];
        j += 1;
    }
}
If the array contains random numbers, the branch outcome becomes difficult for the predictor to learn, and the misprediction rate is typically high, often approaching 50%.

This can be avoided by eliminating the branching.

for (int i = 0, j = 0; i < 1000; i++) {
    small_numbers[j] = numbers[i];
    j += (numbers[i] < 500);
}
While this version introduces an unconditional store, its cost is typically far lower than that of a branch misprediction.

When ‘if’ slows you down, avoid it

Quicksort in C - Performance Comparison

Performance results naturally depend on the underlying hardware. The following benchmarks show the execution times for sorting 50 million integers using different sorting implementations. The measurements were taken on an Apple M1 system and on an Intel Xeon (E5-2680) system with the -O3 compiler option.

Implementation Apple M1 Intel Xeon
Baseline Quicksort 3.234s 4.953s
std::sort (C++) 1.190s 4.949s
Branch-Avoidant Single-Threaded 0.921s 1.814s
Branch-Avoidant Multi-Threaded 0.243s 0.461s

Baseline Quicksort

This is a simple iterative quicksort written in C using a Lomuto‑style partition.

The pivot is placed directly into its final position during partitioning, and the algorithm always continues with the larger part while storing the smaller one on a manual stack.

// SPDX-License-Identifier: MIT
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>

#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int

#define NAME "Baseline Quicksort"

TYPE* partition(TYPE* left, TYPE* right);

void sort(TYPE* data, int len) {

    TYPE* left = data;
    TYPE* stack[80];    // for max 2^40 items
    unsigned sp = 0;
    TYPE* right = data + len - 1;

    while (1) {
        long partsz = right - left;
        if (partsz > 0) {
            TYPE* mid = partition(left, right);
            if (mid < left + partsz / 2) {
                stack[sp] = mid + 1;
                stack[sp + 1] = right;
                right = mid - 1;
            }
            else {
                stack[sp] = left;
                stack[sp + 1] = mid - 1;
                left = mid + 1;
            }
            sp += 2;
        }
        else {
            if (sp == 0) break;
            sp -= 2;
            left = stack[sp];
            right = stack[sp + 1];
        }
    }
}

// ---------------------- partition ----------------------------

#define swap(a, b) do { TYPE _h = a; a = b; b = _h; } while (0)

TYPE* partition(TYPE* left, TYPE* right)
{
    TYPE* outerleft = left;
    TYPE* pivp = left + (right - left) / 2;

    TYPE piv = *pivp;
    *pivp = *outerleft;
    TYPE* mid = left;
    left += 1;

    while (left <= right) {
        if (IS_LOWER(*left, piv)) {
            mid++;
            swap(*mid, *left);
        }
        left++;
    }
    *outerleft = *mid;
    *mid = piv;
    return mid;
}

// ---------------------- testing and main -----------------------------------

unsigned chksum;
unsigned hash_el(void *p, int sz) {
    unsigned char* c = (unsigned char*)p;
    unsigned h = 0;
    for (int i = 0; i < sz; i++) h = (h * 131) + c[i];
    return h;
}
void init(TYPE* data, int len) {
    chksum = 0;
    for (int i = 0; i < len; i++) {
        data[i] = rand();
        chksum += hash_el(&data[i], sizeof(TYPE));
    }
}
void test(TYPE* data, int len) {
    unsigned chks = hash_el(&data[0], sizeof(TYPE));;
    for (int i = 1; i < len; i++) {
        if (IS_LOWER(data[i], data[i - 1])) {
            printf("ERROR ORDER\n");
            return;
        }
        chks += hash_el(&data[i], sizeof(TYPE));
    }
    if (chks != chksum) printf("ERROR CHKS\n");
}
double t(void) {

    static double twr;
    struct timeval tv;
    gettimeofday(&tv, NULL);
    double h = twr;
    twr = tv.tv_sec + tv.tv_usec / 1000000.0;
    return twr - h;
}

#define SIZE (50 * 1000000)
TYPE data[SIZE];

int main(void) {

    srand(time(NULL));
    init(data, SIZE);
    printf(NAME"\n");
    printf("Sorting %d million integers ...\n", SIZE / 1000000);
    t();
    sort(data, SIZE);
    printf("%.3fs\n", t());
    test(data, SIZE);
    return 0;
}
Baseline Quicksort
Sorting 50 million integers ...
3.234s

C++ std::sort

For comparison C++ std::sort:

// SPDX-License-Identifier: MIT
#include <algorithm>
#include <sys/time.h>

#define TYPE int
#define NAME "std::sort"

void sort(TYPE* data, int len) {
    std::sort(data, data + len);
}

void init(TYPE* data, int len) {
    for (int i = 0; i < len; i++) {
        data[i] = rand();
    }
}
double t(void) {
    static double twr;
    struct timeval tv;
    gettimeofday(&tv, NULL);
    double h = twr;
    twr = tv.tv_sec + tv.tv_usec / 1000000.0;
    return twr - h;
}

#define SIZE (50 * 1000000)
TYPE data[SIZE];

int main(void) {

    srand(time(NULL));
    init(data, SIZE);
    printf(NAME"\n");
    printf("Sorting %d million integers ...\n", SIZE / 1000000);
    t();
    sort(data, SIZE);
    printf("%.3fs\n", t());
    test(data, SIZE);
    return 0;
}
std::sort
Sorting 50 million integers ...
1.190s

Branch-Avoidant Single-Threaded

The paper https://arxiv.org/abs/1604.06697 by Edelkamp and A. Weiß shows how partitioning performance in Quicksort can be improved by avoiding conditional branches.

The idea of using auxiliary memory for branchless partitioning is inspired by https://github.com/scandum/fluxsort .

To avoid the O(n²) runtime caused by many duplicates, the program switches to heap sort for a segment if it detects a significant bias in the partitioning.

Small ranges are finished using a sorting network.

// SPDX-License-Identifier: MIT
// (c) 2026 christof.kaser@gmail.com
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>
#include <stddef.h>

#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int

#define NAME "Branch-Avoidant Quicksort Sorting-Network 12"

TYPE* partition(TYPE* left, TYPE* right);
void sorting_network(TYPE* left, int size);
void heap_sort(TYPE* left, TYPE* right);
TYPE* partition_small(TYPE* left, TYPE* right);

#define SMALLPART 256

void sort(TYPE* data, int len) {

    TYPE* left = data;
    TYPE* stack[80];    // for max 2^40 items
    unsigned sp = 0;
    TYPE* right = data + len - 1;

    while (1) {
        ptrdiff_t partsz = right - left;
        TYPE* mid;
        if (partsz <= SMALLPART) {
            if (partsz <= 11) {
                sorting_network(left, partsz);
                goto pop_stack;
            }
            mid = partition_small(left, right);
        }
        else  {
            mid = partition(left, right);
            if ((mid - left) * 16 < partsz) {
                heap_sort(left, right);
                goto pop_stack;
            }
        }
        if (mid < left + partsz / 2) {
            stack[sp] = mid + 1;
            stack[sp + 1] = right;
            right = mid - 1;
        }
        else {
            stack[sp] = left;
            stack[sp + 1] = mid - 1;
            left = mid + 1;
        }
        sp += 2;
        continue;

    pop_stack:
        if (sp == 0) break;
        sp -= 2;
        left = stack[sp];
        right = stack[sp + 1];
    }
}

// ---------------------- sorting network ----------------------------

#define sort2(a, b) do {  \
    TYPE x = a; \
    TYPE y = b; \
    unsigned m = IS_LOWER(x, y); \
    a = m ? x : y;  \
    b = m ? y : x;  \
} while(0)

// https://bertdobbelaere.github.io/sorting_networks.html

#define sort3(a,b,c) do { \
    sort2(a,c);\
    sort2(a,b);\
    sort2(b,c);\
} while(0)

#define sort4(a,b,c,d) do { \
    sort2(a,c);sort2(b,d);\
    sort2(a,b);sort2(c,d);\
    sort2(b,c);\
} while(0)

#define sort5(a,b,c,d,e) do { \
    sort2(a,d);sort2(b,e);\
    sort2(a,c);sort2(b,d);\
    sort2(a,b);sort2(c,e);\
    sort2(b,c);sort2(d,e);\
    sort2(c,d);\
} while(0)

#define sort6(a,b,c,d,e,f) do { \
    sort2(a,f);sort2(b,d);sort2(c,e);\
    sort2(b,c);sort2(d,e);\
    sort2(a,d);sort2(c,f);\
    sort2(a,b);sort2(c,d);sort2(e,f);\
    sort2(b,c);sort2(d,e);\
} while(0)

#define sort7(a,b,c,d,e,f,g) do { \
    sort2(a,g);sort2(c,d);sort2(e,f);\
    sort2(a,c);sort2(b,e);sort2(d,g);\
    sort2(a,b);sort2(c,f);sort2(d,e);\
    sort2(b,c);sort2(e,g);\
    sort2(c,d);sort2(e,f);\
    sort2(b,c);sort2(d,e);sort2(f,g);\
} while(0)

#define sort8(a,b,c,d,e,f,g,h) do { \
    sort2(a,c);sort2(b,d);sort2(e,g);sort2(f,h);\
    sort2(a,e);sort2(b,f);sort2(c,g);sort2(d,h);\
    sort2(a,b);sort2(c,d);sort2(e,f);sort2(g,h);\
    sort2(c,e);sort2(d,f);\
    sort2(b,e);sort2(d,g);\
    sort2(b,c);sort2(d,e);sort2(f,g);\
} while (0)

#define sort9(a,b,c,d,e,f,g,h,i) do { \
    sort2(a,d);sort2(b,h);sort2(c,f);sort2(e,i);\
    sort2(a,h);sort2(c,e);sort2(d,i);sort2(f,g);\
    sort2(a,c);sort2(b,d);sort2(e,f);sort2(h,i);\
    sort2(b,e);sort2(d,g);sort2(f,h);\
    sort2(a,b);sort2(c,e);sort2(d,f);sort2(g,i);\
    sort2(c,d);sort2(e,f);sort2(g,h);\
    sort2(b,c);sort2(d,e);sort2(f,g);\
} while (0)

#define sort10(a,b,c,d,e,f,g,h,i,j) do { \
    sort2(a,i);sort2(b,j);sort2(c,h);sort2(d,f);sort2(e,g);\
    sort2(a,c);sort2(b,e);sort2(f,i);sort2(h,j);\
    sort2(a,d);sort2(c,e);sort2(f,h);sort2(g,j);\
    sort2(a,b);sort2(d,g);sort2(i,j);\
    sort2(b,f);sort2(c,d);sort2(e,i);sort2(g,h);\
    sort2(b,c);sort2(d,f);sort2(e,g);sort2(h,i);\
    sort2(c,d);sort2(e,f);sort2(g,h);\
    sort2(d,e);sort2(f,g);\
} while(0)

#define sort11(a,b,c,d,e,f,g,h,i,j,k) do { \
    sort2(a,j);sort2(b,g);sort2(c,e);sort2(d,h);sort2(f,i);\
    sort2(a,b);sort2(d,f);sort2(e,k);sort2(g,j);sort2(h,i);\
    sort2(b,d);sort2(c,f);sort2(e,h);sort2(i,k);\
    sort2(a,e);sort2(b,c);sort2(d,h);sort2(f,j);sort2(g,i);\
    sort2(a,b);sort2(c,g);sort2(e,f);sort2(h,i);sort2(j,k);\
    sort2(c,e);sort2(d,g);sort2(f,h);sort2(i,j);\
    sort2(b,c);sort2(d,e);sort2(f,g);sort2(h,i);\
    sort2(c,d);sort2(e,f);sort2(g,h);\
} while (0)

#define sort12(a,b,c,d,e,f,g,h,i,j,k,l) do { \
    sort2(a,i);sort2(b,h);sort2(c,g);sort2(d,l);sort2(e,k);sort2(f,j);\
    sort2(a,b);sort2(c,f);sort2(d,e);sort2(g,j);sort2(h,i);sort2(k,l);\
    sort2(a,c);sort2(b,g);sort2(f,k);sort2(j,l);\
    sort2(a,d);sort2(b,c);sort2(e,g);sort2(f,h);sort2(i,l);sort2(j,k);\
    sort2(b,e);sort2(d,f);sort2(g,i);sort2(h,k);\
    sort2(b,d);sort2(c,f);sort2(g,j);sort2(i,k);\
    sort2(c,d);sort2(e,f);sort2(g,h);sort2(i,j);\
    sort2(e,g);sort2(f,h);\
    sort2(d,e);sort2(f,g);sort2(h,i);\
} while (0)

void sorting_network(TYPE* l, int partsz_min_1) {
    // partsz_min_1 == right - left
    switch (partsz_min_1) {
    case 11:
        sort12(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11]);
        break;
    case 10:
        sort11(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8], l[9], l[10]);
        break;
    case 9:
        sort10(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8], l[9]);
        break;
    case 8:
        sort9(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8]);
        break;
    case 7:
        sort8(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7]);
        break;
    case 6:
        sort7(l[0], l[1], l[2], l[3], l[4], l[5], l[6]);
        break;
    case 5:
        sort6(l[0], l[1], l[2], l[3], l[4], l[5]);
        break;
    case 4:
        sort5(l[0], l[1], l[2], l[3], l[4]);
        break;
    case 3:
        sort4(l[0], l[1], l[2], l[3]);
        break;
    case 2:
        sort3(l[0], l[1], l[2]);
        break;
    case 1:
        sort2(l[0], l[1]);
        break;
    case 0:
        break;
    }
}
// ---------------------- partition ----------------------------

#define med5(a, b, c, d, e) do { \
    sort2(a, b); sort2(c, d); sort2(a, c); \
    sort2(b, d); sort2(b, c); sort2(c, e); \
    sort2(b, c); \
} while(0)

TYPE* partition_small(TYPE* restrict  left, TYPE* restrict  right) {
    TYPE* outerleft = left;
    TYPE* pivp = left + (right - left) / 2;

    med5(left[1], left[2], *pivp, right[-1], *right);
    left += 3;
    right -= 2;

    TYPE piv = *pivp;
    *pivp = *outerleft;

    TYPE swbuf[SMALLPART];
    TYPE* sw = swbuf;
    TYPE* lwr = left;
    while (left <= right) {
        unsigned h = IS_LOWER(*left, piv); *lwr = *sw = *left++; lwr += h; sw += !h;
    }
    memcpy(lwr, swbuf, (sw - swbuf) * sizeof(TYPE));
    lwr -= 1;
    *outerleft = *lwr;
    *lwr = piv;
    return lwr;
}

#define SWSZ 2048
#define UNROLL 16

TYPE* partition(TYPE* restrict left, TYPE* restrict right) {

    TYPE* outerleft = left;
    TYPE* pivp = left + (right - left) / 2;

    med5(left[3], left[4], left[1], left[5], left[6]);
    med5(left[11], left[12], left[2], left[13], left[14]);
    med5(pivp[-20], pivp[-10], pivp[0], pivp[10], pivp[20]);
    med5(right[-6], right[-7], right[-1], right[-8], right[-9]);
    med5(right[-15], right[-14], right[0], right[-13], right[-12]);
    med5(left[1], left[2], pivp[0], right[-1], right[0]);
    left += 3;
    right -= 2;

    TYPE piv = *pivp;
    *pivp = *outerleft;

    TYPE swbuf[SWSZ];

    TYPE* lwr = left;
    TYPE* rwr = right;

    TYPE* sw = swbuf;
    while (sw < swbuf + SWSZ - UNROLL && left <= right - UNROLL) {
        for (int i = UNROLL; i--;) {
            unsigned h = IS_LOWER(*right, piv); *rwr = *sw = *right--; rwr -= !h; sw += h;
        }
    }
    while (sw < swbuf + SWSZ - UNROLL && left <= right) {
        unsigned h = IS_LOWER(*right, piv); *rwr = *sw = *right--; rwr -= !h; sw += h;
    }
    while (left <= right - UNROLL) {
        while (rwr > right + UNROLL && left <= right - UNROLL) {
            for (int i = UNROLL; i--;) {
                unsigned h = IS_LOWER(*left, piv); *lwr = *rwr = *left++; lwr += h; rwr -= !h;
            }
        }
        while (lwr < left - UNROLL && left <= right - UNROLL) {
            for (int i = UNROLL; i--;) {
                unsigned h = IS_LOWER(*right, piv); *rwr = *lwr = *right--; rwr -= !h; lwr += h;
            }
        }
    }
    while (rwr > right && left <= right) {
        unsigned h = IS_LOWER(*left, piv); *lwr = *rwr = *left++; lwr += h; rwr -= !h;
    }
    while (left <= right) {
        unsigned h = IS_LOWER(*right, piv); *rwr = *lwr = *right--; rwr -= !h; lwr += h;
    }
    memcpy(lwr, swbuf, (sw - swbuf) * sizeof(TYPE));

    *outerleft = *rwr;
    *rwr = piv;
    return rwr;
}

// ---------------------- heap_sort, testing and main -----------------------------------

void heap_sort(TYPE* left, TYPE* right) {

    long n = right - left + 1;
    if (n < 2) return;
    for (long i = n / 2, j; ; ) {
        TYPE k;
        if (i > 0) {
            k = left[--i];
        }
        else {
            n -= 1;
            if (n == 0) return;
            k = left[n];
            left[n] = left[0];
        }
        j = i;
        while (j * 2 + 1 < n) {
            long child = j * 2 + 1;
            if (child + 1 < n && IS_LOWER(left[child], left[child + 1])) child++;
            if (!IS_LOWER(k, left[child])) break;
            left[j] = left[child];
            j = child;
        }
        left[j] = k;
    }
}

unsigned chksum;
unsigned hash_el(void *p, int sz) {
    unsigned char* c = (unsigned char*)p;
    unsigned h = 0;
    for (int i = 0; i < sz; i++) h = (h * 131) + c[i];
    return h;
}
void init(TYPE* data, int len) {
    chksum = 0;
    for (int i = 0; i < len; i++) {
        data[i] = rand();
        chksum += hash_el(&data[i], sizeof(TYPE));
    }
}
void test(TYPE* data, int len) {
    unsigned chks = hash_el(&data[0], sizeof(TYPE));;
    for (int i = 1; i < len; i++) {
        if (IS_LOWER(data[i], data[i - 1])) {
            printf("ERROR ORDER\n");
            return;
        }
        chks += hash_el(&data[i], sizeof(TYPE));
    }
    if (chks != chksum) printf("ERROR CHKS\n");
}
double t(void) {

    static double twr;
    struct timeval tv;
    gettimeofday(&tv, NULL);
    double h = twr;
    twr = tv.tv_sec + tv.tv_usec / 1000000.0;
    return twr - h;
}

#define SIZE (50 * 1000000)
TYPE data[SIZE];

int main(void) {
    srand(time(NULL));
    init(data, SIZE);
    printf(NAME"\n");
    printf("Sorting %d million integers ...\n", SIZE / 1000000);
    t();
    sort(data, SIZE);
    printf("%.3fs\n", t());
    test(data, SIZE);

    return 0;
}
Branch-Avoidant Quicksort Sorting-Network 12
Sorting 50 million integers ...
0.921s

Branch-Avoidant Multi-Threaded

The algorithm is extended to use multiple CPU cores.

New threads are created only for large partitions and are limited so that we don’t start too many threads. Smaller partitions are processed in the current thread.

// SPDX-License-Identifier: MIT
// (c) 2026 christof.kaser@gmail.com
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>
#include <pthread.h>
#include <unistd.h>
#include <stdatomic.h>

#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int

#define NAME "Threaded Branchless Quicksort"

void heap_sort(TYPE* left, TYPE* right);
void sorting_network(TYPE* left, int size);
TYPE* partition(TYPE* left, TYPE* right);
TYPE* partition_small(TYPE* left, TYPE* right);

#define SMALLPART 256

int max_threads;
atomic_int n_threads;

pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

void* sort_thr(void *arg) {

    TYPE** stack = (TYPE**)arg;
    int sp = 0;
    TYPE* left = stack[sp];
    TYPE* right = stack[sp + 1];

    while (1) {
        int partsz = right - left;
        if (partsz <= 11) {
            sorting_network(left, partsz);
            goto pop_stack;
        }
        TYPE* l;
        TYPE* r;

        TYPE* mid;
        if (partsz <= SMALLPART) mid = partition_small(left, right);
        else mid = partition(left, right);

        if (partsz > SMALLPART && mid - left < 100) {
            heap_sort(left, right);
            goto pop_stack;
        }

        if (mid < left + partsz / 2) {
            l = mid + 1;
            r = right;
            right = mid - 1;
        }
        else {
            l = left;
            r = mid - 1;
            left = mid + 1;
        }
        // soft limit: may exceed max_threads
        TYPE** stack_new = NULL;
        if (r - l > 1000000 && n_threads < max_threads) {
            // start a new thread - max_threads is a soft limit
            stack_new = malloc(64 * sizeof(TYPE*));
            // stack size 64 is sufficient for n <= 2^32
            if (stack_new) {
                stack_new[0] = l;
                stack_new[1] = r;

                pthread_mutex_lock(&mtx);
                pthread_t thread;
                if (pthread_create(&thread, NULL, sort_thr, stack_new) == 0) {
                    n_threads += 1;
                    pthread_detach(thread);
                }
                else {
                    free(stack_new);
                    stack_new = NULL;
                }
                pthread_mutex_unlock(&mtx);
            }
        }
        if (stack_new == NULL) {
            stack[sp] = l;
            stack[sp + 1] = r;
            sp += 2;
        }
        continue;

    pop_stack:
        if (sp == 0) break;
        sp -= 2;
        left = stack[sp];
        right = stack[sp + 1];
    }

    free(stack);
    pthread_mutex_lock(&mtx);
    n_threads -= 1;
    if (n_threads == 0) pthread_cond_signal(&cond);
    pthread_mutex_unlock(&mtx);

    return NULL;
}

void sort(TYPE* data, int len) {

    int n_cpus = sysconf(_SC_NPROCESSORS_ONLN);
    // printf("CPUs: %d\n", n_cpus);
    if (n_cpus > 0) max_threads = n_cpus * 2;
    else max_threads = 8;

    pthread_t thread;
    TYPE** stack = malloc(64 * sizeof(TYPE*));
    stack[0] = data;
    stack[1] = data + len - 1;
    n_threads = 1;
    pthread_create(&thread, NULL, sort_thr, stack);

    pthread_mutex_lock(&mtx);
    while (n_threads != 0) pthread_cond_wait(&cond, &mtx);
    pthread_mutex_unlock(&mtx);
}

// ---------------------- sorting network ----------------------------

#define sort2(a, b) do {  \
    TYPE x = a; \
    TYPE y = b; \
    unsigned m = IS_LOWER(x, y); \
    a = m ? x : y;  \
    b = m ? y : x;  \
} while(0)

// https://bertdobbelaere.github.io/sorting_networks.html

#define sort3(a, b, c) do { \
    sort2(a, b); sort2(b, c); sort2(a, b); \
} while(0)

#define sort4(a, b, c, d) do { \
    sort2(a, b); sort2(c, d); sort2(a, c); \
    sort2(b, d); sort2(b, c); \
} while(0)

#define sort5(a, b, c, d, e) do { \
    sort2(b, c); sort2(d, e); sort2(b, d); \
    sort2(a, c); sort2(a, d); sort2(c, e); \
    sort2(a, b); sort2(c, d); sort2(b, c); \
} while(0)

#define sort6(a, b, c, d, e, f) do { \
    sort2(a, b); sort2(c, d); sort2(e, f); \
    sort2(a, c); sort2(b, d); sort2(e, f); \
    sort2(a, e); sort2(b, f); sort2(c, e); \
    sort2(d, f); sort2(b, c); sort2(d, e); \
    sort2(c, d); \
} while(0)

#define sort7(a, b, c, d, e, f, g) do { \
    sort2(a, g); sort2(c, d); sort2(e, f); \
    sort2(a, c); sort2(b, e); sort2(d, g); \
    sort2(a, b); sort2(c, f); sort2(d, e); \
    sort2(b, c); sort2(e, g); \
    sort2(c, d); sort2(e, f); \
    sort2(b, c); sort2(d, e); sort2(f, g); \
} while(0)

#define sort8(a,b,c,d,e,f,g,h) do { \
    sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); \
    sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); \
    sort2(b,c); sort2(f,g); \
    sort2(a,e); sort2(b,f); sort2(c,g); sort2(d,h); \
    sort2(c,e); sort2(d,f); \
    sort2(b,c); sort2(d,e); sort2(f,g); \
} while (0)

#define sort9(a,b,c,d,e,f,g,h,i) do { \
    sort2(a,d); sort2(b,h); sort2(c,f); sort2(e,i); \
    sort2(a,h); sort2(c,e); sort2(d,i); sort2(f,g); \
    sort2(a,c); sort2(b,d); sort2(e,f); sort2(h,i); \
    sort2(b,e); sort2(d,g); sort2(f,h); \
    sort2(a,b); sort2(c,e); sort2(d,f); sort2(g,i); \
    sort2(c,d); sort2(e,f); sort2(g,h); \
    sort2(b,c); sort2(d,e); sort2(f,g); \
} while (0)

#define sort10(a, b, c, d, e, f, g, h, i, j) do { \
    sort2(a, b); sort2(c, f); sort2(d, g); sort2(e, h); sort2(i, j); \
    sort2(a, g); sort2(b, i); sort2(c, e); sort2(d, j); sort2(f, h); \
    sort2(a, c); sort2(b, d); sort2(e, f); sort2(g, i); sort2(h, j); \
    sort2(a, b); sort2(c, h); sort2(d, f); sort2(e, g); sort2(i, j); \
    sort2(b, c); sort2(d, e); sort2(f, g); sort2(h, i); \
    sort2(b, d); sort2(c, e); sort2(f, h); sort2(g, i); \
    sort2(c, d); sort2(e, f); sort2(g, h); \
} while(0)

#define sort11(a,b,c,d,e,f,g,h,i,j,k) do { \
    sort2(a,j); sort2(b,g); sort2(c,e); sort2(d,h); sort2(f,i); \
    sort2(a,b); sort2(d,f); sort2(e,k); sort2(g,j); sort2(h,i); \
    sort2(b,d); sort2(c,f); sort2(e,h); sort2(i,k); \
    sort2(a,e); sort2(b,c); sort2(d,h); sort2(f,j); sort2(g,i); \
    sort2(a,b); sort2(c,g); sort2(e,f); sort2(h,i); sort2(j,k); \
    sort2(c,e); sort2(d,g); sort2(f,h); sort2(i,j); \
    sort2(b,c); sort2(d,e); sort2(f,g); sort2(h,i); \
    sort2(c,d); sort2(e,f); sort2(g,h); \
} while (0)

#define sort12(a,b,c,d,e,f,g,h,i,j,k,l) do { \
    sort2(a,i); sort2(b,h); sort2(c,g); sort2(d,l); sort2(e,k); sort2(f,j); \
    sort2(a,c); sort2(b,e); sort2(d,f); sort2(g,i); sort2(h,k); sort2(j,l); \
    sort2(a,b); sort2(c,j); sort2(e,h); sort2(f,g); sort2(k,l); \
    sort2(b,d); sort2(c,h); sort2(e,j); sort2(i,k); \
    sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
    sort2(b,c); sort2(d,f); sort2(g,i); sort2(j,k); \
    sort2(c,e); sort2(d,g); sort2(f,i); sort2(h,j); \
    sort2(b,c); sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
} while (0)

void sorting_network(TYPE* l, int partsz_min_1) {
    // partsz_min_1 == right - left
    switch (partsz_min_1) {
    case 11:
        sort12(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
            l[8], l[9], l[10], l[11]);
        break;
    case 10:
        sort11(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8], l[9], l[10]);
        break;
    case 9:
        sort10(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8], l[9]);
        break;
    case 8:
        sort9(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8]);
        break;
    case 7:
        sort8(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7]);
        break;
    case 6:
        sort7(l[0], l[1], l[2], l[3], l[4], l[5], l[6]);
        break;
    case 5:
        sort6(l[0], l[1], l[2], l[3], l[4], l[5]);
        break;
    case 4:
        sort5(l[0], l[1], l[2], l[3], l[4]);
        break;
    case 3:
        sort4(l[0], l[1], l[2], l[3]);
        break;
    case 2:
        sort3(l[0], l[1], l[2]);
        break;
    case 1:
        sort2(l[0], l[1]);
        break;
    case 0:
        break;
    }
}
// ---------------------- partition ----------------------------

#define med5(a, b, c, d, e) do { \
    sort2(a, b); sort2(c, d); sort2(a, c); \
    sort2(b, d); sort2(b, c); sort2(c, e); \
    sort2(b, c); \
} while(0)

TYPE* partition_small(TYPE* restrict  left, TYPE* restrict  right) {
    TYPE* outerleft = left;
    TYPE* pivp = left + (right - left) / 2;

    med5(left[1], left[2], *pivp, right[-1], *right);
    left += 3;
    right -= 2;

    TYPE piv = *pivp;
    *pivp = *outerleft;

    TYPE swbuf[SMALLPART];
    TYPE* sw = swbuf;
    TYPE* lwr = left;
    while (left <= right) {
        unsigned h = IS_LOWER(*left, piv); *lwr = *sw = *left++; lwr += h; sw += !h;
    }
    memcpy(lwr, swbuf, (sw - swbuf) * sizeof(TYPE));
    lwr -= 1;
    *outerleft = *lwr;
    *lwr = piv;
    return lwr;
}

#define SWSZ 2048
#define UNROLL 16

TYPE* partition(TYPE* restrict left, TYPE* restrict right) {

    TYPE* outerleft = left;
    TYPE* pivp = left + (right - left) / 2;

    med5(left[3], left[4], left[1], left[5], left[6]);
    med5(left[11], left[12], left[2], left[13], left[14]);
    med5(pivp[-20], pivp[-10], pivp[0], pivp[10], pivp[20]);
    med5(right[-6], right[-7], right[-1], right[-8], right[-9]);
    med5(right[-15], right[-14], right[0], right[-13], right[-12]);
    med5(left[1], left[2], pivp[0], right[-1], right[0]);
    left += 3;
    right -= 2;

    TYPE piv = *pivp;
    *pivp = *outerleft;

    TYPE swbuf[SWSZ];

    TYPE* lwr = left;
    TYPE* rwr = right;

    TYPE* sw = swbuf;
    while (sw < swbuf + SWSZ - UNROLL && left <= right - UNROLL) {
        for (int i = UNROLL; i--;) {
            unsigned h = IS_LOWER(*right, piv); *rwr = *sw = *right--; rwr -= !h; sw += h;
        }
    }
    while (sw < swbuf + SWSZ - UNROLL && left <= right) {
        unsigned h = IS_LOWER(*right, piv); *rwr = *sw = *right--; rwr -= !h; sw += h;
    }
    while (left <= right - UNROLL) {
        while (rwr > right + UNROLL && left <= right - UNROLL) {
            for (int i = UNROLL; i--;) {
                unsigned h = IS_LOWER(*left, piv); *lwr = *rwr = *left++; lwr += h; rwr -= !h;
            }
        }
        while (lwr < left - UNROLL && left <= right - UNROLL) {
            for (int i = UNROLL; i--;) {
                unsigned h = IS_LOWER(*right, piv); *rwr = *lwr = *right--; rwr -= !h; lwr += h;
            }
        }
    }
    while (rwr > right && left <= right) {
        unsigned h = IS_LOWER(*left, piv); *lwr = *rwr = *left++; lwr += h; rwr -= !h;
    }
    while (left <= right) {
        unsigned h = IS_LOWER(*right, piv); *rwr = *lwr = *right--; rwr -= !h; lwr += h;
    }
    memcpy(lwr, swbuf, (sw - swbuf) * sizeof(TYPE));

    *outerleft = *rwr;
    *rwr = piv;
    return rwr;
}

// ---------------------- heap_sort, testing and main -----------------------------------

void heap_sort(TYPE* left, TYPE* right) {

    long n = right - left + 1;
    if (n < 2) return;
    for (long i = n / 2, j; ; ) {
        TYPE k;
        if (i > 0) {
            k = left[--i];
        }
        else {
            n -= 1;
            if (n == 0) return;
            k = left[n];
            left[n] = left[0];
        }
        j = i;
        while (j * 2 + 1 < n) {
            long child = j * 2 + 1;
            if (child + 1 < n && IS_LOWER(left[child], left[child + 1])) child++;
            if (!IS_LOWER(k, left[child])) break;
            left[j] = left[child];
            j = child;
        }
        left[j] = k;
    }
}

unsigned chksum;
unsigned hash_el(void *p, int sz) {
    unsigned char* c = (unsigned char*)p;
    unsigned h = 0;
    for (int i = 0; i < sz; i++) h = (h * 131) + c[i];
    return h;
}
void init(TYPE* data, int len) {
    chksum = 0;
    for (int i = 0; i < len; i++) {
        data[i] = rand();
        chksum += hash_el(&data[i], sizeof(TYPE));
    }
}
void test(TYPE* data, int len) {
    unsigned chks = hash_el(&data[0], sizeof(TYPE));;
    for (int i = 1; i < len; i++) {
        if (IS_LOWER(data[i], data[i - 1])) {
            printf("ERROR ORDER\n");
            return;
        }
        chks += hash_el(&data[i], sizeof(TYPE));
    }
    if (chks != chksum) printf("ERROR CHKS\n");
}
double t(void) {

    static double twr;
    struct timeval tv;
    gettimeofday(&tv, NULL);
    double h = twr;
    twr = tv.tv_sec + tv.tv_usec / 1000000.0;
    return twr - h;
}

#define SIZE (50 * 1000000)
TYPE data[SIZE];

int main(void) {

    srand(time(NULL));
    init(data, SIZE);
    printf(NAME"\n");
    printf("Sorting %d million integers ...\n", SIZE / 1000000);
    t();
    sort(data, SIZE);
    printf("%.3fs\n", t());
    test(data, SIZE);
    return 0;
}
Threaded Branchless Quicksort
Sorting 50 million numbers integers ...
0.243s
Multithreading and the avoidance of branch mispredictions can significantly improve the application performance of programs.


Interactive sorting demo


christof.kaser@gmail.com