Make Quicksort Fast Using Threads and Avoiding Branch Misprediction

Multithreading has become increasingly important with stagnating single‑thread performance and widespread multicore CPUs. The challenge is to divide the work across multiple threads in a way that actually improves performance.

This is not possible for every algorithm. However, Quicksort is a classic divide‑and‑conquer algorithm, which makes it well suited for parallelization, since independent subproblems can be processed concurrently.

Avoiding branch misprediction is another way to speed up programs. One effective way to reduce branch misprediction is to eliminate the branches altogether.

for (int i = 0, j = 0; i < 1000; i++) {
    if (numbers[i] < 500) {
        small_numbers[j] = numbers[i];
        j += 1;
    }
}
If the array contains random numbers, the branch outcome becomes difficult for the predictor to learn, and the misprediction rate is typically high, often approaching 50%.

This can be avoided by eliminating the branching.

for (int i = 0, j = 0; i < 1000; i++) {
    small_numbers[j] = numbers[i];
    j += (numbers[i] < 500);
}

Quicksort in C

Performance results naturally depend on the hardware. The following measurements were taken on an M1 MacBook Air.

The following implementations can be pushed into an O(n²) runtime by intentionally chosen inputs. This can be avoided by shuffling the data before sorting.

Single-Threaded

This is a simple iterative quicksort written in C using a Lomuto‑style partition.

The pivot is placed directly into its final position during partitioning, and the algorithm always continues with the larger part while storing the smaller one on a manual stack.

Small ranges are finished using insertion sort. A global sentinel simplifies insertion sort by eliminating explicit boundary checks.

// SPDX-License-Identifier: MIT
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>

#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int

#define INSERTSORT_THRE 32
#define swap(a, b) do { __typeof__(a) _h = a; a = b; b = _h; } while (0)

TYPE* partition(TYPE* left, TYPE* right);
void insert_sort(TYPE* left, TYPE* right);

void sort(TYPE* data, int len) {

    for (TYPE* p = data + 1; p < data + len; p++) {
        if (*p < *data) swap(*p, *data);
    }
    TYPE* stack[64];    // for max 2^32 items
    unsigned sp = 0;
    TYPE* left = data + 1;
    TYPE* right = data + len - 1;

    while (1) {
        long partsz = right - left;
        if (partsz < INSERTSORT_THRE) {
            insert_sort(left, right);
            if (sp == 0) break;
            sp -= 2;
            left = stack[sp];
            right = stack[sp + 1];
        }
        else {
            TYPE* mid;
            mid = partition(left, right);
            if (mid < left + partsz / 2) {
                stack[sp] = mid + 1;
                stack[sp + 1] = right;
                right = mid - 1;
            }
            else {
                stack[sp] = left;
                stack[sp + 1] = mid - 1;
                left = mid + 1;
            }
            sp += 2;
        }
    }
}

void insert_sort(TYPE* left, TYPE* right) {
    for (TYPE* pi = left + 1; pi <= right; pi++) {
        TYPE h = *pi;
        TYPE* pj = pi - 1;
        while (IS_LOWER(h, *pj)) {
            *(pj + 1) = *pj;
            pj -= 1;
        }
        *(pj + 1) = h;
    }
}

TYPE* partition(TYPE* left, TYPE* right)
{
    TYPE* outerleft = left;
    TYPE* pivp = left + (right - left) / 2;

    TYPE piv = *pivp;
    *pivp = *outerleft;

    TYPE* store = left;

    for (TYPE* p = left + 1; p <= right; p++) {
        if (*p < piv) {
            store++;
            swap(*store, *p);
        }
    }
    *outerleft = *store;
    *store = piv;
    return store;
}

double t(void) {
    static double t0;
    struct timeval tv;
    gettimeofday(&tv, NULL);
    double h = t0;
    t0 = tv.tv_sec + tv.tv_usec / 1000000.0;
    return t0 - h;
}

unsigned chksum;

void init(TYPE* data, int len) {
    chksum = 0;
    for (int i = 0; i < len; i++) {
        data[i] = rand() / 128;
        chksum += *(unsigned*)&data[i];
    }
}

void test(TYPE* data, int len) {
    unsigned chks = data[0];
    for (int i = 1; i < len; i++) {
        if (data[i] < data[i - 1]) {
            printf("ERROR ORDER\n");
            break;
        }
        chks += *(unsigned*)&data[i];
    }
    if (chks != chksum) printf("ERROR CHKS\n");
}

#define SIZE (50 * 1000000)

TYPE data[SIZE];

int main(void) {

    init(data, SIZE);
    printf("Sorting %d million numbers with Quicksort ...\n",
        SIZE / 1000000);
    t();
    sort(data, SIZE);
    printf("%.3fs\n", t());
    test(data, SIZE);
    return 0;
}
Sorting 50 million numbers with Quicksort ...
3.350s

C++ std::sort

For comparison C++ std::sort:

.
.
#include <algorithm>

void sort(int* data, int len) {
  std::sort(data, data + len);
}
.
.
Sorting 50 million numbers with std::sort ...
1.190s

Single-Threaded Branchless

The paper https://arxiv.org/abs/1604.06697 by Edelkamp and A. Weiß shows how partitioning performance in Quicksort can be improved by avoiding conditional branches.

The idea of using auxiliary memory for branchless partitioning is inspired by https://github.com/scandum/fluxsort .

The pivot is chosen to avoid bad cases, and small parts are sorted using insertion sort and small sorting networks.

To avoid the O(n²) trap with duplicates, a simple heuristic detects extreme partition imbalance and the segment switches to Heapsort.

// SPDX-License-Identifier: MIT
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>

#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int
#define swap(a, b) do { __typeof__(a) _h = a; a = b; b = _h; } while (0)

#define INSERTSORT_THRE 16
#define SMALLPART 2048

TYPE* partition_small(TYPE* left, TYPE* right);
TYPE* partition(TYPE* left, TYPE* right);
void insert_sort(TYPE* left, TYPE* right);
void heap_sort(TYPE* left, TYPE* right);

void sort(TYPE* data, int len) {

    for (TYPE* p = data + 1; p < data + len; p++) {
        if (*p < *data) swap(*p, *data);
    }
    TYPE* left = data + 1;
    TYPE* stack[64];    // for max 2^32 items
    unsigned sp = 0;
    TYPE* right = data + len - 1;

    while (1) {
        long partsz = right - left;
        if (partsz < INSERTSORT_THRE) {
            insert_sort(left, right);
            goto pop_stack;
        }
        TYPE* mid;
        if (partsz <= SMALLPART) mid = partition_small(left, right);
        else mid = partition(left, right);

        if (partsz > SMALLPART && mid - left < 100) {
            heap_sort(left, right);
            goto pop_stack;
        }
        if (mid < left + partsz / 2) {
            stack[sp] = mid + 1;
            stack[sp + 1] = right;
            right = mid - 1;
        }
        else {
            stack[sp] = left;
            stack[sp + 1] = mid - 1;
            left = mid + 1;
        }
        sp += 2;
        continue;

    pop_stack:
        if (sp == 0) break;
        sp -= 2;
        left = stack[sp];
        right = stack[sp + 1];
    }
}

void heap_sort(TYPE* left, TYPE* right) {
    long n = right - left + 1;
    if (n < 2) return;
    for (long i = n / 2, j, k; ; ) {
        if (i > 0) {
            k = left[--i];
        }
        else {
            n -= 1;
            if (n == 0) return;
            k = left[n];
            left[n] = left[0];
        }
        j = i;
        while (j * 2 + 1 < n) {
            long child = j * 2 + 1;
            if (child + 1 < n && IS_LOWER(left[child], left[child + 1])) child++;
            if (!IS_LOWER(k, left[child])) break;
            left[j] = left[child];
            j = child;
        }
        left[j] = k;
    }
}

#define sort2(a, b) do { \
    if (IS_LOWER(b, a)) swap(a, b); \
} while(0)              \

#define sort3(a, b, c) do { \
    sort2(a, b);        \
    sort2(b, c);        \
    sort2(a, b);        \
} while(0)              \

#define sort4(a, b, c, d) do { \
    sort2(a, b);        \
    sort2(c, d);        \
    sort2(a, c);        \
    sort2(b, d);        \
    sort2(b, c);        \
} while(0)              \

#define sort5(a, b, c, d, e) do { \
    sort2(b, c);        \
    sort2(d, e);        \
    sort2(b, d);        \
    sort2(a, c);        \
    sort2(a, d);        \
    sort2(c, e);        \
    sort2(a, b);        \
    sort2(c, d);        \
    sort2(b, c);        \
} while(0)              \

void insert_sort(TYPE* left, TYPE* right) {
    int h = right - left;
    if (h >= 4) {
        sort5(*left, *(left + 1), *(left + 2), *(left + 3), *(left + 4));
        for (TYPE* pi = left + 5; pi <= right; pi++) {
            TYPE h = *pi;
            TYPE* pj = pi - 1;
            while (IS_LOWER(h, *pj)) {
                *(pj + 1) = *pj;
                pj -= 1;
            }
            *(pj + 1) = h;
        }
    }
    else {
        switch (h) {
        case 3:
            sort4(*left, *(left + 1), *(left + 2), *(left + 3));
            break;
        case 2:
            sort3(*left, *(left + 1), *(left + 2));
            break;
        case 1:
            sort2(*left, *(left + 1));
            break;
        }
    }
}

#define SELSZ 16

void quicksel(TYPE* left, TYPE* right) {

    TYPE _Alignas(64) swap[2 * SELSZ];

    memcpy(swap, left + 1, SELSZ * sizeof(TYPE));
    memcpy(left + 1, right + 1, SELSZ * sizeof(TYPE));

    TYPE* l0 = left - SELSZ;
    TYPE* r0 = left + SELSZ;

    while (1) {
        TYPE piv = *l0;
        TYPE* l = l0 + 1;
        TYPE* r = r0;
        while (1) {
            while (IS_LOWER(*l, piv)) l++;
            while (IS_LOWER(piv, *r)) r--;
            if (l >= r) break;
            swap(*l, *r);
            l++;
            r--;
        }
        *l0 = *r;
        *r = piv;
        if (r == left) break;
        if (r < left) l0 = r + 1;
        else r0 = r - 1;
    }
    memcpy(right + 1, left + 1, SELSZ * sizeof(TYPE));
    memcpy(left + 1, swap, SELSZ * sizeof(TYPE));
}

#define INSERT_SORT_THRE 16
#define SMALLPART 2048

TYPE* partition_small(TYPE* left, TYPE* right) {

    unsigned n = right - left;
    // n > SMALLPART

    TYPE* outerleft = left;
    TYPE* pivp = left + n / 2;

    sort3(*(left + 1), *pivp, *right);
    left += 2;
    right--;

    TYPE piv = *pivp;

    *pivp = *outerleft;

    TYPE _Alignas(64) swap[SMALLPART];
    TYPE* sw = swap;

    TYPE* left0 = left;
    unsigned h;

    while (left <= right - 8) {
        for (int i = 8; i--;) {
            h = IS_LOWER(*left, piv); *left0 = *sw = *left++; left0 += h; sw += !h;
        }
    }
    while (left <= right) {
        h = IS_LOWER(*left, piv); *left0 = *sw = *left++; left0 += h; sw += !h;
    }
    TYPE* mid = left0 - 1;
    TYPE* sw0 = swap;
    while (sw0 < sw) *left0++ = *sw0++;

    *outerleft = *mid;
    *mid = piv;
    return mid;
}

TYPE* partition(TYPE* left, TYPE* right) {

    TYPE* outerleft = left;
    TYPE piv;

    left++;
    left += SELSZ;
    right -= SELSZ;

    TYPE* pivp = left + (right - left) / 2;
    swap(*(left - SELSZ), *pivp);
    quicksel(left, right);

    piv = *left;
    *left = *outerleft;

    #define SWSZ 1024
    TYPE _Alignas(64) swap[SWSZ];

    TYPE* left0 = left;
    TYPE* right0 = right;
    unsigned h;

    TYPE* sw = swap;
    while (sw < swap + SWSZ - 16 && left <= right - 16) {
        for (int i = 16; i--;) {
            h = IS_LOWER(*right, piv); *right0 = *sw = *right--; right0 -= !h; sw += h;
        }
    }
    while (sw < swap + SWSZ - 16 && left <= right) {
        h = IS_LOWER(*right, piv); *right0 = *sw = *right--; right0 -= !h; sw += h;
    }
    while (1) {
        while (right0 > right + 16 && left <= right - 16) {
            for (int i = 16; i--;) {
                h = IS_LOWER(*left, piv); *left0 = *right0 = *left++; left0 += h; right0 -= !h;
            }
        }
        if (left > right - 16) break;

        while (left0 < left - 16 && left <= right - 16) {
            for (int i = 16; i--;) {
                h = IS_LOWER(*right, piv); *right0 = *left0 = *right--; right0 -= !h; left0 += h;
            }
        }
        if (left > right - 16) break;
    }
    while (right0 > right && left <= right) {
        h = IS_LOWER(*left, piv); *left0 = *right0 = *left++; left0 += h; right0 -= !h;
    }
    while (left <= right) {
        h = IS_LOWER(*right, piv); *right0 = *left0 = *right--; right0 -= !h; left0 += h;
    }
    TYPE* sw0 = swap;
    while (sw0 < sw) *left0++ = *sw0++;
    left0--;

    *outerleft = *left0;
    *left0 = piv;
    return left0;
}

double t(void) {
    static double t0;
    struct timeval tv;
    gettimeofday(&tv, NULL);
    double h = t0;
    t0 = tv.tv_sec + tv.tv_usec / 1000000.0;
    return t0 - h;
}

unsigned chksum;

void init(TYPE* data, int len) {
    unsigned chks = 0;
    for (int i = 0; i < len; i++) {
        data[i] = rand() / 128;
        chks += *(unsigned*)&data[i];
    }
    chksum = chks;
}

void test(TYPE* data, int len) {
    unsigned chks = data[0];
    for (int i = 1; i < len; i++) {
        if (data[i] < data[i - 1]) {
            printf("ERROR ORDER\n");
            break;
        }
        chks += *(unsigned*)&data[i];
    }
    if (chks != chksum) printf("ERROR CHKS\n");
}

#define SIZE (50 * 1000000)

TYPE data[SIZE];

int main(void) {

    init(data, SIZE);
    printf("Sorting %d million numbers with Branchless Quicksort ...\n",
        SIZE / 1000000);
    t();
    sort(data, SIZE);
    printf("%.3fs\n", t());
    test(data, SIZE);
    return 0;
}
Sorting 50 million numbers with Branchless Quicksort ...
1.151s

Multi-Threaded Branchless

The algorithm is extended to use multiple CPU cores.

New threads are created only for large partitions and are limited so that we don’t start too many threads. Smaller partitions are processed locally in the current thread.

// SPDX-License-Identifier: MIT
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>
#include <pthread.h>
#include <unistd.h>
#include <stdatomic.h>

#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int
#define swap(a, b) do { __typeof__(a) h = a; a = b; b = h; } while (0)

#define INSERTSORT_THRE 16
#define SMALLPART 2048

TYPE* partition_small(TYPE* left, TYPE* right);
TYPE* partition(TYPE* left, TYPE* right);
void insert_sort(TYPE* left, TYPE* right);
void heap_sort(TYPE* left, TYPE* right);

int max_threads;
atomic_int n_threads;

pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

void* sort_thr(void *arg) {

    TYPE** stack = (TYPE**)arg;
    int sp = 0;
    TYPE* left = stack[sp];
    TYPE* right = stack[sp + 1];

    while (1) {
        int partsz = right - left;
        if (partsz < INSERTSORT_THRE) {
            insert_sort(left, right);
            goto pop_stack;
        }
        TYPE* l;
        TYPE* r;

        TYPE* mid;
        if (partsz <= SMALLPART) mid = partition_small(left, right);
        else mid = partition(left, right);

        if (partsz > SMALLPART && mid - left < 100) {
            heap_sort(left, right);
            goto pop_stack;
        }

        if (mid < left + partsz / 2) {
            l = mid + 1;
            r = right;
            right = mid - 1;
        }
        else {
            l = left;
            r = mid - 1;
            left = mid + 1;
        }
        // soft limit: may exceed max_threads
        TYPE** stack_new = NULL;
        if (r - l > 1000000 && n_threads < max_threads) {
            // start a new thread - max_threads is a soft limit
            stack_new = malloc(64 * sizeof(TYPE*));
            // stack size 64 is sufficient for n <= 2^32
            if (stack_new) {
                stack_new[0] = l;
                stack_new[1] = r;

                pthread_mutex_lock(&mtx);
                pthread_t thread;
                if (pthread_create(&thread, NULL, sort_thr, stack_new) == 0) {
                    n_threads += 1;
                    pthread_detach(thread);
                }
                else {
                    free(stack_new);
                    stack_new = NULL;
                }
                pthread_mutex_unlock(&mtx);
            }
        }
        if (stack_new == NULL) {
            stack[sp] = l;
            stack[sp + 1] = r;
            sp += 2;
        }
        continue;

    pop_stack:
        if (sp == 0) break;
        sp -= 2;
        left = stack[sp];
        right = stack[sp + 1];
    }

    free(stack);
    pthread_mutex_lock(&mtx);
    n_threads -= 1;
    if (n_threads == 0) pthread_cond_signal(&cond);
    pthread_mutex_unlock(&mtx);

    return NULL;
}

void sort(TYPE* data, int len) {

    int n_cpus = sysconf(_SC_NPROCESSORS_ONLN);
    // printf("CPUs: %d\n", n_cpus);
    if (n_cpus > 0) max_threads = n_cpus * 2;
    else max_threads = 8;

    for (TYPE* p = data + 1; p < data + len; p++) {
        if (*p < *data) swap(*p, *data);
    }
    pthread_t thread;
    TYPE** stack = malloc(64 * sizeof(TYPE*));
    stack[0] = data + 1;
    stack[1] = data + len - 1;
    n_threads = 1;
    pthread_create(&thread, NULL, sort_thr, stack);

    pthread_mutex_lock(&mtx);
    while (n_threads != 0) pthread_cond_wait(&cond, &mtx);
    pthread_mutex_unlock(&mtx);
}

void heap_sort(TYPE* left, TYPE* right) {
    long n = right - left + 1;
    if (n < 2) return;
    for (long i = n / 2, j, k; ; ) {
        if (i > 0) {
            k = left[--i];
        }
        else {
            n -= 1;
            if (n == 0) return;
            k = left[n];
            left[n] = left[0];
        }
        j = i;
        while (j * 2 + 1 < n) {
            long child = j * 2 + 1;
            if (child + 1 < n && IS_LOWER(left[child], left[child + 1])) child++;
            if (!IS_LOWER(k, left[child])) break;
            left[j] = left[child];
            j = child;
        }
        left[j] = k;
    }
}

#define sort2(a, b) do { \
    if (IS_LOWER(b, a)) swap(a, b); \
} while(0)              \

#define sort3(a, b, c) do { \
    sort2(a, b);        \
    sort2(b, c);        \
    sort2(a, b);        \
} while(0)              \

#define sort4(a, b, c, d) do { \
    sort2(a, b);        \
    sort2(c, d);        \
    sort2(a, c);        \
    sort2(b, d);        \
    sort2(b, c);        \
} while(0)              \

#define sort5(a, b, c, d, e) do { \
    sort2(b, c);        \
    sort2(d, e);        \
    sort2(b, d);        \
    sort2(a, c);        \
    sort2(a, d);        \
    sort2(c, e);        \
    sort2(a, b);        \
    sort2(c, d);        \
    sort2(b, c);        \
} while(0)              \

void insert_sort(TYPE* left, TYPE* right) {
    int h = right - left;
    if (h >= 4) {
        sort5(*left, *(left + 1), *(left + 2), *(left + 3), *(left + 4));
        for (TYPE* pi = left + 5; pi <= right; pi++) {
            TYPE h = *pi;
            TYPE* pj = pi - 1;
            while (IS_LOWER(h, *pj)) {
                *(pj + 1) = *pj;
                pj -= 1;
            }
            *(pj + 1) = h;
        }
    }
    else {
        switch (h) {
        case 3:
            sort4(*left, *(left + 1), *(left + 2), *(left + 3));
            break;
        case 2:
            sort3(*left, *(left + 1), *(left + 2));
            break;
        case 1:
            sort2(*left, *(left + 1));
            break;
        }
    }
}

#define SELSZ 16

void quicksel(TYPE* left, TYPE* right) {

    TYPE _Alignas(64) swap[2 * SELSZ];

    memcpy(swap, left + 1, SELSZ * sizeof(TYPE));
    memcpy(left + 1, right + 1, SELSZ * sizeof(TYPE));

    TYPE* l0 = left - SELSZ;
    TYPE* r0 = left + SELSZ;

    while (1) {
        TYPE piv = *l0;
        TYPE* l = l0 + 1;
        TYPE* r = r0;
        while (1) {
            while (IS_LOWER(*l, piv)) l++;
            while (IS_LOWER(piv, *r)) r--;
            if (l >= r) break;
            swap(*l, *r);
            l++;
            r--;
        }
        *l0 = *r;
        *r = piv;
        if (r == left) break;
        if (r < left) l0 = r + 1;
        else r0 = r - 1;
    }
    memcpy(right + 1, left + 1, SELSZ * sizeof(TYPE));
    memcpy(left + 1, swap, SELSZ * sizeof(TYPE));
}

TYPE* partition_small(TYPE* left, TYPE* right) {

    unsigned n = right - left;

    TYPE* outerleft = left;
    TYPE* pivp = left + n / 2;

    sort3(*(left + 1), *pivp, *right);
    left += 2;
    right--;

    TYPE piv = *pivp;

    *pivp = *outerleft;

    TYPE _Alignas(64) swap[SMALLPART];
    TYPE* sw = swap;

    TYPE* left0 = left;
    unsigned h;

    while (left <= right - 8) {
        for (int i = 8; i--;) {
            h = IS_LOWER(*left, piv); *left0 = *sw = *left++; left0 += h; sw += !h;
        }
    }
    while (left <= right) {
        h = IS_LOWER(*left, piv); *left0 = *sw = *left++; left0 += h; sw += !h;
    }
    TYPE* mid = left0 - 1;
    TYPE* sw0 = swap;
    while (sw0 < sw) *left0++ = *sw0++;

    *outerleft = *mid;
    *mid = piv;
    return mid;
}

TYPE* partition(TYPE* left, TYPE* right) {

    unsigned n = right - left;
    // n > SMALLPART

    TYPE* outerleft = left;
    TYPE* pivp = left + n / 2;
    TYPE piv;

    left++;
    swap(*left, *pivp);
    left += SELSZ;
    right -= SELSZ;
    quicksel(left, right);

    piv = *left;
    *left = *outerleft;

    #define SWSZ 1024
    TYPE _Alignas(64) swap[SWSZ];

    TYPE* left0 = left;
    TYPE* right0 = right;
    unsigned h;

    TYPE* sw = swap;
    while (sw < swap + SWSZ - 16 && left <= right - 16) {
        for (int i = 16; i--;) {
            h = IS_LOWER(*right, piv); *right0 = *sw = *right--; right0 -= !h; sw += h;
        }
    }
    while (sw < swap + SWSZ - 16 && left <= right) {
        h = IS_LOWER(*right, piv); *right0 = *sw = *right--; right0 -= !h; sw += h;
    }
    while (1) {
        while (right0 > right + 16 && left <= right - 16) {
            for (int i = 16; i--;) {
                h = IS_LOWER(*left, piv); *left0 = *right0 = *left++; left0 += h; right0 -= !h;
            }
        }
        if (left > right - 16) break;

        while (left0 < left - 16 && left <= right - 16) {
            for (int i = 16; i--;) {
                h = IS_LOWER(*right, piv); *right0 = *left0 = *right--; right0 -= !h; left0 += h;
            }
        }
        if (left > right - 16) break;
    }
    while (right0 > right && left <= right) {
        h = IS_LOWER(*left, piv); *left0 = *right0 = *left++; left0 += h; right0 -= !h;
    }
    while (left <= right) {
        h = IS_LOWER(*right, piv); *right0 = *left0 = *right--; right0 -= !h; left0 += h;
    }
    TYPE* sw0 = swap;
    while (sw0 < sw) *left0++ = *sw0++;
    left0--;

    *outerleft = *left0;
    *left0 = piv;
    return left0;
}


double t(void) {
    static double t0;
    struct timeval tv;
    gettimeofday(&tv, NULL);
    double h = t0;
    t0 = tv.tv_sec + tv.tv_usec / 1000000.0;
    return t0 - h;
}

unsigned chksum;

void init(TYPE* data, int len) {
    unsigned chks = 0;
    for (int i = 0; i < len; i++) {
        data[i] = rand() / 128;
        chks += *(unsigned*)&data[i];
    }
    chksum = chks;
}

void test(TYPE* data, int len) {
    unsigned chks = data[0];
    for (int i = 1; i < len; i++) {
        if (data[i] < data[i - 1]) {
            printf("ERROR ORDER\n");
            break;
        }
        chks += *(unsigned*)&data[i];
    }
    if (chks != chksum) printf("ERROR CHKS\n");
}

#define SIZE (50 * 1000000)

TYPE data[SIZE];

int main(void) {

    init(data, SIZE);
    printf("Sorting %d million numbers with Threaded Branchless Quicksort ...\n",
        SIZE / 1000000);
    t();
    sort(data, SIZE);
    printf("%.3fs\n", t());
    test(data, SIZE);
    return 0;
}
Sorting 50 million numbers with Threaded Branchless Quicksort ...
0.275s
Multithreading and the avoidance of branch mispredictions can significantly improve the application performance of programs.


Interactive sorting demo