Fast Branchless Quicksort using Sorting-Networks with C++ Interface

On modern CPUs, avoiding branch misprediction is a key technique to speed up programs: When ‘if’ slows you down, avoid it.

Performance results naturally depend on the underlying hardware. The following benchmarks show the execution times for sorting 50 million doubles using different sorting implementations. The measurements were taken on an Apple M1 system and on an AMD Ryzen 3 system with the -O3 compiler option.

Implementation Apple M1 AMD Ryzen
std::sort 1.33s 5.56s
pdqsort 1.33s 2.81s
blqsort 1.02s 2.06s

blqsort

Full source code is included on the page in scrollable blocks.

This paper by Edelkamp and A. Weiß shows how partitioning performance in Quicksort can be improved by avoiding conditional branches.

The strategy of using an auxiliary buffer for branchless partitioning is inspired by fluxsort. The “auxiliary buffer” here means a 512‑element stack array, not heap memory.

To avoid the O(n²) runtime caused by bad input data, the program can group identical elements together and switch to heapsort for that specific part if it detects a big imbalance during partitioning. The program also checks if a partition is already sorted.

For larger parts, it uses a median-of-medians strategy to find a good pivot. In addition, critical partitioning loops are explicitly unrolled.

For 2 to 12 elements, the algorithm uses custom sorting networks. This approach requires a separate code path for each size but sorts small subsets with very few swaps using a branchless sort‑2 primitive. Source for sorting networks

As a result, blqsort becomes faster than, for example, std::sort and pdqsort for random data.

blqs.h

// SPDX-License-Identifier: MIT
// blqs.h - Branchless Quicksort
// (c) 2026 christof.kaser@gmail.com

#ifndef BLQS_H
#define BLQS_H

#include <cstdint>
#include <cstring>
#include <stddef.h>
#include <functional>

namespace blqs {

template<typename T, typename Compare>
struct Networks {
    static inline void sort2(T& a, T& b, Compare comp) {
        T x = a; T y = b;
        bool m = comp(x, y);
        a = m ? x : y; b = m ? y : x;
    }
    static inline void sort3(T& a, T& b, T& c, Compare comp) {
        sort2(a, b, comp); sort2(b, c, comp); sort2(a, b, comp);
    }
    static inline void sort4(T& a, T& b, T& c, T& d, Compare comp) {
        sort2(a, b, comp); sort2(c, d, comp); sort2(a, c, comp);
        sort2(b, d, comp); sort2(b, c, comp);
    }
    static inline void sort5(T& a, T& b, T& c, T& d, T& e, Compare comp) {
        sort2(b, c, comp); sort2(d, e, comp); sort2(b, d, comp);
        sort2(a, c, comp); sort2(a, d, comp); sort2(c, e, comp);
        sort2(a, b, comp); sort2(c, d, comp); sort2(b, c, comp);
    }
    static inline void sort6(T& a, T& b, T& c, T& d, T& e, T& f, Compare comp) {
        sort2(a, b, comp); sort2(c, d, comp); sort2(e, f, comp);
        sort2(a, c, comp); sort2(b, d, comp); sort2(e, f, comp);
        sort2(a, e, comp); sort2(b, f, comp); sort2(c, e, comp);
        sort2(d, f, comp); sort2(b, c, comp); sort2(d, e, comp);
        sort2(c, d, comp);
    }
    static inline void sort7(T& a, T& b, T& c, T& d, T& e, T& f, T& g, Compare comp) {
        sort2(a, g, comp); sort2(c, d, comp); sort2(e, f, comp);
        sort2(a, c, comp); sort2(b, e, comp); sort2(d, g, comp);
        sort2(a, b, comp); sort2(c, f, comp); sort2(d, e, comp);
        sort2(b, c, comp); sort2(e, g, comp);
        sort2(c, d, comp); sort2(e, f, comp);
        sort2(b, c, comp); sort2(d, e, comp); sort2(f, g, comp);
    }
    static inline void sort8(T& a, T& b, T& c, T& d, T& e, T& f, T& g, T& h, Compare comp) {
        sort2(a,b,comp); sort2(c,d,comp); sort2(e,f,comp); sort2(g,h,comp);
        sort2(a,c,comp); sort2(b,d,comp); sort2(e,g,comp); sort2(f,h,comp);
        sort2(b,c,comp); sort2(f,g,comp);
        sort2(a,e,comp); sort2(b,f,comp); sort2(c,g,comp); sort2(d,h,comp);
        sort2(c,e,comp); sort2(d,f,comp);
        sort2(b,c,comp); sort2(d,e,comp); sort2(f,g,comp);
    }
    static inline void sort9(T& a, T& b, T& c, T& d, T& e, T& f, T& g, T& h, T& i, Compare comp) {
        sort2(a,d,comp); sort2(b,h,comp); sort2(c,f,comp); sort2(e,i,comp);
        sort2(a,h,comp); sort2(c,e,comp); sort2(d,i,comp); sort2(f,g,comp);
        sort2(a,c,comp); sort2(b,d,comp); sort2(e,f,comp); sort2(h,i,comp);
        sort2(b,e,comp); sort2(d,g,comp); sort2(f,h,comp);
        sort2(a,b,comp); sort2(c,e,comp); sort2(d,f,comp); sort2(g,i,comp);
        sort2(c,d,comp); sort2(e,f,comp); sort2(g,h,comp);
        sort2(b,c,comp); sort2(d,e,comp); sort2(f,g,comp);
    }
    static inline void sort10(T& a, T& b, T& c, T& d, T& e, T& f, T& g, T& h, T& i, T& j, Compare comp) {
        sort2(a,b,comp); sort2(c,f,comp); sort2(d,g,comp); sort2(e,h,comp); sort2(i,j,comp);
        sort2(a,g,comp); sort2(b,i,comp); sort2(c,e,comp); sort2(d,j,comp); sort2(f,h,comp);
        sort2(a,c,comp); sort2(b,d,comp); sort2(e,f,comp); sort2(g,i,comp); sort2(h,j,comp);
        sort2(a,b,comp); sort2(c,h,comp); sort2(d,f,comp); sort2(e,g,comp); sort2(i,j,comp);
        sort2(b,c,comp); sort2(d,e,comp); sort2(f,g,comp); sort2(h,i,comp);
        sort2(b,d,comp); sort2(c,e,comp); sort2(f,h,comp); sort2(g,i,comp);
        sort2(c,d,comp); sort2(e,f,comp); sort2(g,h,comp);
    }
    static inline void sort11(T& a, T& b, T& c, T& d, T& e, T& f, T& g, T& h, T& i, T& j, T& k, Compare comp) {
        sort2(a,j,comp); sort2(b,g,comp); sort2(c,e,comp); sort2(d,h,comp); sort2(f,i,comp);
        sort2(a,b,comp); sort2(d,f,comp); sort2(e,k,comp); sort2(g,j,comp); sort2(h,i,comp);
        sort2(b,d,comp); sort2(c,f,comp); sort2(e,h,comp); sort2(i,k,comp);
        sort2(a,e,comp); sort2(b,c,comp); sort2(d,h,comp); sort2(f,j,comp); sort2(g,i,comp);
        sort2(a,b,comp); sort2(c,g,comp); sort2(e,f,comp); sort2(h,i,comp); sort2(j,k,comp);
        sort2(c,e,comp); sort2(d,g,comp); sort2(f,h,comp); sort2(i,j,comp);
        sort2(b,c,comp); sort2(d,e,comp); sort2(f,g,comp); sort2(h,i,comp);
        sort2(c,d,comp); sort2(e,f,comp); sort2(g,h,comp);
    }
    static inline void sort12(T& a, T& b, T& c, T& d, T& e, T& f, T& g, T& h, T& i, T& j, T& k, T& l, Compare comp) {
        sort2(a,i,comp); sort2(b,h,comp); sort2(c,g,comp); sort2(d,l,comp); sort2(e,k,comp); sort2(f,j,comp);
        sort2(a,c,comp); sort2(b,e,comp); sort2(d,f,comp); sort2(g,i,comp); sort2(h,k,comp); sort2(j,l,comp);
        sort2(a,b,comp); sort2(c,j,comp); sort2(e,h,comp); sort2(f,g,comp); sort2(k,l,comp);
        sort2(b,d,comp); sort2(c,h,comp); sort2(e,j,comp); sort2(i,k,comp);
        sort2(a,b,comp); sort2(c,d,comp); sort2(e,f,comp); sort2(g,h,comp); sort2(i,j,comp); sort2(k,l,comp);
        sort2(b,c,comp); sort2(d,f,comp); sort2(g,i,comp); sort2(j,k,comp);
        sort2(c,e,comp); sort2(d,g,comp); sort2(f,i,comp); sort2(h,j,comp);
        sort2(b,c,comp); sort2(d,e,comp); sort2(f,g,comp); sort2(h,i,comp); sort2(j,k,comp);
    }
};

template<typename T, typename Compare>
void heap_sort(T* left, T* right, Compare comp) {
    long n = right - left + 1;
    if (n < 2) return;
    for (long i = n / 2; ; ) {
        T k;
        if (i > 0) k = left[--i];
        else {
            n -= 1; if (n == 0) return;
            k = left[n]; left[n] = left[0];
        }
        long j = i;
        while (j * 2 + 1 < n) {
            long child = j * 2 + 1;
            if (child + 1 < n && comp(left[child], left[child + 1])) child++;
            if (!comp(k, left[child])) break;
            left[j] = left[child]; j = child;
        }
        left[j] = k;
    }
}

template<typename T, typename Compare>
class Sorter {
    static constexpr int SMALLPART = 256;
    static constexpr int SWSZ =  512;
    static constexpr int UNROLL = 16;

    static inline void med5(T& a, T& b, T& c, T& d, T& e, Compare comp) {
        Networks<T, Compare>::sort2(a, b, comp); Networks<T, Compare>::sort2(c, d, comp);
        Networks<T, Compare>::sort2(a, c, comp); Networks<T, Compare>::sort2(b, d, comp);
        Networks<T, Compare>::sort2(b, c, comp); Networks<T, Compare>::sort2(c, e, comp);
        Networks<T, Compare>::sort2(b, c, comp);
    }

    static void sorting_network(T* l, int partsz_min_1, Compare comp) {
        switch (partsz_min_1) {
            case 11: Networks<T, Compare>::sort12(l[0],l[1],l[2],l[3],l[4],l[5],l[6],l[7],l[8],l[9],l[10],l[11],comp); break;
            case 10: Networks<T, Compare>::sort11(l[0],l[1],l[2],l[3],l[4],l[5],l[6],l[7],l[8],l[9],l[10],comp); break;
            case 9: Networks<T, Compare>::sort10(l[0],l[1],l[2],l[3],l[4],l[5],l[6],l[7],l[8],l[9],comp); break;
            case 8: Networks<T, Compare>::sort9(l[0],l[1],l[2],l[3],l[4],l[5],l[6],l[7],l[8],comp); break;
            case 7: Networks<T, Compare>::sort8(l[0],l[1],l[2],l[3],l[4],l[5],l[6],l[7],comp); break;
            case 6: Networks<T, Compare>::sort7(l[0],l[1],l[2],l[3],l[4],l[5],l[6],comp); break;
            case 5: Networks<T, Compare>::sort6(l[0],l[1],l[2],l[3],l[4],l[5],comp); break;
            case 4: Networks<T, Compare>::sort5(l[0],l[1],l[2],l[3],l[4],comp); break;
            case 3: Networks<T, Compare>::sort4(l[0],l[1],l[2],l[3],comp); break;
            case 2: Networks<T, Compare>::sort3(l[0],l[1],l[2],comp); break;
            case 1: Networks<T, Compare>::sort2(l[0], l[1], comp); break;
            default: break;
        }
    }

    static T* partition_small(T* __restrict__ left, T* __restrict__ right, Compare comp) {
        T* outerleft = left;
        T* pivp = left + (right - left) / 2;
        med5(left[1], left[2], *pivp, right[-1], *right, comp);
        left += 3; right -= 2;
        T piv = *pivp; *pivp = *outerleft;

        T swbuf[SMALLPART];
        T *sw = swbuf, *lwr = left;
        while (left <= right) {
            bool h = comp(*left, piv);
            *lwr = *sw = *left++;
            lwr += h; sw += !h;
        }
        std::move(swbuf, sw, lwr);
        lwr -= 1; *outerleft = *lwr; *lwr = piv;
        return lwr;
    }

    static T* partition_large(T* __restrict__ left, T* __restrict__ right, Compare comp) {
        T* outerleft = left;
        T* pivp = left + (right - left) / 2;

        med5(left[1], left[2], left[3], left[4], left[5], comp);
        med5(left[21], left[22], left[23], left[24], left[25], comp);
        med5(pivp[-2], pivp[-1], pivp[0], pivp[1], pivp[2], comp);
        med5(right[-14], right[-13], right[-12], right[-11], right[-10], comp);
        med5(right[-4], right[-3], right[-2], right[-1], right[0], comp);
        med5(left[3], left[23], pivp[0], right[-12], right[-2], comp);

        left += 1;
        T piv = *pivp; *pivp = *outerleft;

        while (comp(*left, piv)) left++;
        if (left >= outerleft + 8) {
            // could already be sorted
            *pivp = piv;
            T* p = outerleft + 1;
            while (p <= right) {
                if (comp(*p, *(p - 1))) {
                    *pivp = *outerleft;
                    goto not_sorted;
                }
                p += 1;
            }
            return NULL;
        }
    not_sorted:

        T swbuf[SWSZ];
        T *lwr = left, *rwr = right, *sw = swbuf;

        while (sw < swbuf + SWSZ - UNROLL && left <= right - UNROLL) {
            for (int i = UNROLL; i--;) {
                bool h = comp(*right, piv); *rwr = *sw = *right--; rwr -= !h; sw += h;
            }
        }
        while (sw < swbuf + SWSZ - UNROLL && left <= right) {
            bool h = comp(*right, piv); *rwr = *sw = *right--; rwr -= !h; sw += h;
        }
        while (left <= right - UNROLL) {
            while (rwr > right + UNROLL && left <= right - UNROLL) {
                for (int i = UNROLL; i--;) {
                    bool h = comp(*left, piv); *lwr = *rwr = *left++; lwr += h; rwr -= !h;
                }
            }
            while (lwr < left - UNROLL && left <= right - UNROLL) {
                for (int i = UNROLL; i--;) {
                    bool h = comp(*right, piv); *rwr = *lwr = *right--; rwr -= !h; lwr += h;
                }
            }
        }
        while (rwr > right && left <= right) {
            bool h = comp(*left, piv); *lwr = *rwr = *left++; lwr += h; rwr -= !h;
        }
        while (left <= right) {
            bool h = comp(*right, piv); *rwr = *lwr = *right--; rwr -= !h; lwr += h;
        }
        std::move(swbuf, sw, lwr);
        *outerleft = *rwr; *rwr = piv;
        return rwr;
    }

public:
    static void sort(T* data, T* data_end, Compare comp) {
        if (data_end - data < 2) return;
        T *left = data, *right = data_end - 1;
        T* stack[80]; unsigned sp = 0;

        while (true) {
            ptrdiff_t partsz = right - left;
            T* mid;
            if (partsz <= SMALLPART) {
                if (partsz <= 11) {
                    sorting_network(left, (int)partsz, comp);
                    goto pop_stack;
                }
                mid = partition_small(left, right, comp);
            }
            else {
                mid = partition_large(left, right, comp);
                if (mid == NULL) goto pop_stack;

                if ((mid - left) * 16 < partsz || (right - mid) * 16 < partsz) {
                    heap_sort(left, mid - 1, comp);

                    // gather pivot duplicates at the boundary
                    T piv = *mid;
                    mid += 1;
                    for (T* p = mid; p <= right; p++) {
                        if (!comp(piv, *p)) {
                            std::swap(*mid, *p);
                            mid++;
                        }
                    }
                    heap_sort(mid, right, comp);

                    goto pop_stack;
                }
            }


            if (mid < left + partsz / 2) {
                stack[sp++] = mid + 1; stack[sp++] = right;
                right = mid - 1;
            }
            else {
                stack[sp++] = left; stack[sp++] = mid - 1;
                left = mid + 1;
            }
            continue;

        pop_stack:
            if (sp == 0) break;
            right = stack[--sp]; left = stack[--sp];
        }
    }
};
template<typename T, typename Compare>
void sort(T* first, T* last, Compare comp) {
    Sorter<T, Compare>::sort(first, last, comp);
}
template<typename T>
void sort(T* first, T* last) {
    sort(first, last, std::less<T>());
}
}
#endif
You only need to include this single header file, and it can be used just as easily as std::sort.

#include <cstdio>
#include <cstdlib>
#include <ctime>

#include "blqs.h"

constexpr int SIZE = 50000000;

double data[SIZE];

double cputime() {
    return (double)clock() / CLOCKS_PER_SEC;
}
int main() {
    double t0;
    printf("blqs::sort - sorting %d million doubles ...\n", SIZE / 1000000);

    for (int i = 0; i < SIZE; i++) data[i] = rand() / 1024.0;
    t0 = cputime();
    blqs::sort(data, data + SIZE);
    printf("Random: %.2fs\n", cputime() - t0);

    t0 = cputime();
    blqs::sort(data, data + SIZE);
    printf("Sorted: %.2fs\n", cputime() - t0);

    for (int i = 0; i < 10; i++) std::swap(data[rand() % (SIZE / 10)], data[rand() % (SIZE / 10)]);
    t0 = cputime();
    blqs::sort(data, data + SIZE);
    printf("Nearly sorted: %.2fs\n", cputime() - t0);

    for (int i = 0; i < SIZE; i++) data[i] = rand() % 1000;
    t0 = cputime();
    blqs::sort(data, data + SIZE);
    printf("Duplicates: %.2fs\n", cputime() - t0);
}
blqsort - sorting 50 million doubles ...
Random: 1.02s
Sorted: 0.03s
Nearly sorted: 0.13s
Duplicates: 0.52s
With std::sort and pdqsort (which interestingly has almost exactly the same runtimes), it looks like this:

std::sort - sorting 50 million doubles ...
Random: 1.33s
Sorted: 0.05s
Nearly sorted: 0.07s
Duplicates: 0.28s

Sorting Custom Data Structures

In practice, we often need to sort custom data structures. This is where SIMD libraries like Google Highway - while very fast for simple numbers - become difficult to use.

Using std::sort or blqs::sort gives you much more flexibility;

#include <cstdio>
#include <cstdlib>
#include <ctime>

#include "blqs.h"

constexpr int SIZE = 50000000;

struct entry {
    int32_t id;
    int32_t value;
    bool operator<(const entry& other) const {
        return id < other.id;
    }
};

struct entry data[SIZE];

double cputime() {
    return (double)clock() / CLOCKS_PER_SEC;
}
int main() {
    double t0;
    printf("blqs::sort - sorting %d million structs ...\n", SIZE / 1000000);
    for (int i = 0; i < SIZE; i++) data[i].id = rand();
    t0 = cputime();
    blqs::sort(data, data + SIZE);
    printf("Time: %.2fs\n", cputime() - t0);
}
Execution times for sorting 50 million of this structs.

Implementation Apple M1 AMD Ryzen
std::sort 3.46s 4.75s
pdqsort 3.46s 4.72s
blqsort 0.97s 2.20s

Branchless Quicksort in C

Multi-Threaded Sorting in C

Interactive Sorting Demo


christof.kaser@gmail.com