Multithreading has become increasingly important with stagnating single‑thread performance and widespread multicore CPUs. The challenge is to divide the work across multiple threads in a way that actually improves performance.
This is not possible for every algorithm. However, Quicksort is a classic divide‑and‑conquer algorithm, which makes it well suited for parallelization, since independent subproblems can be processed concurrently.
Avoiding branch misprediction is another way to speed up programs. One effective way to reduce branch misprediction is to eliminate the branches altogether.
for (int i = 0, j = 0; i < 1000; i++) {
if (numbers[i] < 500) {
small_numbers[j] = numbers[i];
j += 1;
}
}
If the array contains random numbers, the branch outcome becomes difficult for the predictor to learn, and the misprediction rate is typically high, often approaching 50%.
This can be avoided by eliminating the branching.
for (int i = 0, j = 0; i < 1000; i++) {
small_numbers[j] = numbers[i];
j += (numbers[i] < 500);
}
Performance results naturally depend on the hardware. The following measurements were taken on an M1 MacBook Air.
The following implementations can be pushed into an O(n²) runtime by intentionally chosen inputs. This can be avoided by shuffling the data before sorting.
This is a simple iterative quicksort written in C using a Lomuto‑style partition.
The pivot is placed directly into its final position during partitioning, and the algorithm always continues with the larger part while storing the smaller one on a manual stack.
Small ranges are finished using insertion sort. A global sentinel simplifies insertion sort by eliminating explicit boundary checks.
// SPDX-License-Identifier: MIT
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>
#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int
#define INSERTSORT_THRE 32
#define swap(a, b) do { __typeof__(a) _h = a; a = b; b = _h; } while (0)
TYPE* partition(TYPE* left, TYPE* right);
void insert_sort(TYPE* left, TYPE* right);
void sort(TYPE* data, int len) {
for (TYPE* p = data + 1; p < data + len; p++) {
if (*p < *data) swap(*p, *data);
}
TYPE* stack[64]; // for max 2^32 items
unsigned sp = 0;
TYPE* left = data + 1;
TYPE* right = data + len - 1;
while (1) {
long partsz = right - left;
if (partsz < INSERTSORT_THRE) {
insert_sort(left, right);
if (sp == 0) break;
sp -= 2;
left = stack[sp];
right = stack[sp + 1];
}
else {
TYPE* mid;
mid = partition(left, right);
if (mid < left + partsz / 2) {
stack[sp] = mid + 1;
stack[sp + 1] = right;
right = mid - 1;
}
else {
stack[sp] = left;
stack[sp + 1] = mid - 1;
left = mid + 1;
}
sp += 2;
}
}
}
void insert_sort(TYPE* left, TYPE* right) {
for (TYPE* pi = left + 1; pi <= right; pi++) {
TYPE h = *pi;
TYPE* pj = pi - 1;
while (IS_LOWER(h, *pj)) {
*(pj + 1) = *pj;
pj -= 1;
}
*(pj + 1) = h;
}
}
TYPE* partition(TYPE* left, TYPE* right)
{
TYPE* outerleft = left;
TYPE* pivp = left + (right - left) / 2;
TYPE piv = *pivp;
*pivp = *outerleft;
TYPE* store = left;
for (TYPE* p = left + 1; p <= right; p++) {
if (*p < piv) {
store++;
swap(*store, *p);
}
}
*outerleft = *store;
*store = piv;
return store;
}
double t(void) {
static double t0;
struct timeval tv;
gettimeofday(&tv, NULL);
double h = t0;
t0 = tv.tv_sec + tv.tv_usec / 1000000.0;
return t0 - h;
}
unsigned chksum;
void init(TYPE* data, int len) {
chksum = 0;
for (int i = 0; i < len; i++) {
data[i] = rand() / 128;
chksum += *(unsigned*)&data[i];
}
}
void test(TYPE* data, int len) {
unsigned chks = data[0];
for (int i = 1; i < len; i++) {
if (data[i] < data[i - 1]) {
printf("ERROR ORDER\n");
break;
}
chks += *(unsigned*)&data[i];
}
if (chks != chksum) printf("ERROR CHKS\n");
}
#define SIZE (50 * 1000000)
TYPE data[SIZE];
int main(void) {
init(data, SIZE);
printf("Sorting %d million numbers with Quicksort ...\n",
SIZE / 1000000);
t();
sort(data, SIZE);
printf("%.3fs\n", t());
test(data, SIZE);
return 0;
}
Sorting 50 million numbers with Quicksort ...
3.350s
For comparison C++ std::sort:
.
.
#include <algorithm>
void sort(int* data, int len) {
std::sort(data, data + len);
}
.
.
Sorting 50 million numbers with std::sort ...
1.190s
The paper https://arxiv.org/abs/1604.06697 by Edelkamp and A. Weiß shows how partitioning performance in Quicksort can be improved by avoiding conditional branches.
The idea of using auxiliary memory for branchless partitioning is inspired by https://github.com/scandum/fluxsort .
The pivot is chosen to avoid bad cases, and small parts are sorted using insertion sort and small sorting networks.
To avoid the O(n²) trap with duplicates, a simple heuristic detects extreme partition imbalance and the segment switches to Heapsort.
// SPDX-License-Identifier: MIT
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>
#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int
#define swap(a, b) do { __typeof__(a) _h = a; a = b; b = _h; } while (0)
#define INSERTSORT_THRE 16
#define SMALLPART 2048
TYPE* partition_small(TYPE* left, TYPE* right);
TYPE* partition(TYPE* left, TYPE* right);
void insert_sort(TYPE* left, TYPE* right);
void heap_sort(TYPE* left, TYPE* right);
void sort(TYPE* data, int len) {
for (TYPE* p = data + 1; p < data + len; p++) {
if (*p < *data) swap(*p, *data);
}
TYPE* left = data + 1;
TYPE* stack[64]; // for max 2^32 items
unsigned sp = 0;
TYPE* right = data + len - 1;
while (1) {
long partsz = right - left;
if (partsz < INSERTSORT_THRE) {
insert_sort(left, right);
goto pop_stack;
}
TYPE* mid;
if (partsz <= SMALLPART) mid = partition_small(left, right);
else mid = partition(left, right);
if (partsz > SMALLPART && mid - left < 100) {
heap_sort(left, right);
goto pop_stack;
}
if (mid < left + partsz / 2) {
stack[sp] = mid + 1;
stack[sp + 1] = right;
right = mid - 1;
}
else {
stack[sp] = left;
stack[sp + 1] = mid - 1;
left = mid + 1;
}
sp += 2;
continue;
pop_stack:
if (sp == 0) break;
sp -= 2;
left = stack[sp];
right = stack[sp + 1];
}
}
void heap_sort(TYPE* left, TYPE* right) {
long n = right - left + 1;
if (n < 2) return;
for (long i = n / 2, j, k; ; ) {
if (i > 0) {
k = left[--i];
}
else {
n -= 1;
if (n == 0) return;
k = left[n];
left[n] = left[0];
}
j = i;
while (j * 2 + 1 < n) {
long child = j * 2 + 1;
if (child + 1 < n && IS_LOWER(left[child], left[child + 1])) child++;
if (!IS_LOWER(k, left[child])) break;
left[j] = left[child];
j = child;
}
left[j] = k;
}
}
#define sort2(a, b) do { \
if (IS_LOWER(b, a)) swap(a, b); \
} while(0) \
#define sort3(a, b, c) do { \
sort2(a, b); \
sort2(b, c); \
sort2(a, b); \
} while(0) \
#define sort4(a, b, c, d) do { \
sort2(a, b); \
sort2(c, d); \
sort2(a, c); \
sort2(b, d); \
sort2(b, c); \
} while(0) \
#define sort5(a, b, c, d, e) do { \
sort2(b, c); \
sort2(d, e); \
sort2(b, d); \
sort2(a, c); \
sort2(a, d); \
sort2(c, e); \
sort2(a, b); \
sort2(c, d); \
sort2(b, c); \
} while(0) \
void insert_sort(TYPE* left, TYPE* right) {
int h = right - left;
if (h >= 4) {
sort5(*left, *(left + 1), *(left + 2), *(left + 3), *(left + 4));
for (TYPE* pi = left + 5; pi <= right; pi++) {
TYPE h = *pi;
TYPE* pj = pi - 1;
while (IS_LOWER(h, *pj)) {
*(pj + 1) = *pj;
pj -= 1;
}
*(pj + 1) = h;
}
}
else {
switch (h) {
case 3:
sort4(*left, *(left + 1), *(left + 2), *(left + 3));
break;
case 2:
sort3(*left, *(left + 1), *(left + 2));
break;
case 1:
sort2(*left, *(left + 1));
break;
}
}
}
#define SELSZ 16
void quicksel(TYPE* left, TYPE* right) {
TYPE _Alignas(64) swap[2 * SELSZ];
memcpy(swap, left + 1, SELSZ * sizeof(TYPE));
memcpy(left + 1, right + 1, SELSZ * sizeof(TYPE));
TYPE* l0 = left - SELSZ;
TYPE* r0 = left + SELSZ;
while (1) {
TYPE piv = *l0;
TYPE* l = l0 + 1;
TYPE* r = r0;
while (1) {
while (IS_LOWER(*l, piv)) l++;
while (IS_LOWER(piv, *r)) r--;
if (l >= r) break;
swap(*l, *r);
l++;
r--;
}
*l0 = *r;
*r = piv;
if (r == left) break;
if (r < left) l0 = r + 1;
else r0 = r - 1;
}
memcpy(right + 1, left + 1, SELSZ * sizeof(TYPE));
memcpy(left + 1, swap, SELSZ * sizeof(TYPE));
}
#define INSERT_SORT_THRE 16
#define SMALLPART 2048
TYPE* partition_small(TYPE* left, TYPE* right) {
unsigned n = right - left;
// n > SMALLPART
TYPE* outerleft = left;
TYPE* pivp = left + n / 2;
sort3(*(left + 1), *pivp, *right);
left += 2;
right--;
TYPE piv = *pivp;
*pivp = *outerleft;
TYPE _Alignas(64) swap[SMALLPART];
TYPE* sw = swap;
TYPE* left0 = left;
unsigned h;
while (left <= right - 8) {
for (int i = 8; i--;) {
h = IS_LOWER(*left, piv); *left0 = *sw = *left++; left0 += h; sw += !h;
}
}
while (left <= right) {
h = IS_LOWER(*left, piv); *left0 = *sw = *left++; left0 += h; sw += !h;
}
TYPE* mid = left0 - 1;
TYPE* sw0 = swap;
while (sw0 < sw) *left0++ = *sw0++;
*outerleft = *mid;
*mid = piv;
return mid;
}
TYPE* partition(TYPE* left, TYPE* right) {
TYPE* outerleft = left;
TYPE piv;
left++;
left += SELSZ;
right -= SELSZ;
TYPE* pivp = left + (right - left) / 2;
swap(*(left - SELSZ), *pivp);
quicksel(left, right);
piv = *left;
*left = *outerleft;
#define SWSZ 1024
TYPE _Alignas(64) swap[SWSZ];
TYPE* left0 = left;
TYPE* right0 = right;
unsigned h;
TYPE* sw = swap;
while (sw < swap + SWSZ - 16 && left <= right - 16) {
for (int i = 16; i--;) {
h = IS_LOWER(*right, piv); *right0 = *sw = *right--; right0 -= !h; sw += h;
}
}
while (sw < swap + SWSZ - 16 && left <= right) {
h = IS_LOWER(*right, piv); *right0 = *sw = *right--; right0 -= !h; sw += h;
}
while (1) {
while (right0 > right + 16 && left <= right - 16) {
for (int i = 16; i--;) {
h = IS_LOWER(*left, piv); *left0 = *right0 = *left++; left0 += h; right0 -= !h;
}
}
if (left > right - 16) break;
while (left0 < left - 16 && left <= right - 16) {
for (int i = 16; i--;) {
h = IS_LOWER(*right, piv); *right0 = *left0 = *right--; right0 -= !h; left0 += h;
}
}
if (left > right - 16) break;
}
while (right0 > right && left <= right) {
h = IS_LOWER(*left, piv); *left0 = *right0 = *left++; left0 += h; right0 -= !h;
}
while (left <= right) {
h = IS_LOWER(*right, piv); *right0 = *left0 = *right--; right0 -= !h; left0 += h;
}
TYPE* sw0 = swap;
while (sw0 < sw) *left0++ = *sw0++;
left0--;
*outerleft = *left0;
*left0 = piv;
return left0;
}
double t(void) {
static double t0;
struct timeval tv;
gettimeofday(&tv, NULL);
double h = t0;
t0 = tv.tv_sec + tv.tv_usec / 1000000.0;
return t0 - h;
}
unsigned chksum;
void init(TYPE* data, int len) {
unsigned chks = 0;
for (int i = 0; i < len; i++) {
data[i] = rand() / 128;
chks += *(unsigned*)&data[i];
}
chksum = chks;
}
void test(TYPE* data, int len) {
unsigned chks = data[0];
for (int i = 1; i < len; i++) {
if (data[i] < data[i - 1]) {
printf("ERROR ORDER\n");
break;
}
chks += *(unsigned*)&data[i];
}
if (chks != chksum) printf("ERROR CHKS\n");
}
#define SIZE (50 * 1000000)
TYPE data[SIZE];
int main(void) {
init(data, SIZE);
printf("Sorting %d million numbers with Branchless Quicksort ...\n",
SIZE / 1000000);
t();
sort(data, SIZE);
printf("%.3fs\n", t());
test(data, SIZE);
return 0;
}
Sorting 50 million numbers with Branchless Quicksort ...
1.151s
The algorithm is extended to use multiple CPU cores.
New threads are created only for large partitions and are limited so that we don’t start too many threads. Smaller partitions are processed locally in the current thread.
// SPDX-License-Identifier: MIT
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>
#include <pthread.h>
#include <unistd.h>
#include <stdatomic.h>
#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int
#define swap(a, b) do { __typeof__(a) h = a; a = b; b = h; } while (0)
#define INSERTSORT_THRE 16
#define SMALLPART 2048
TYPE* partition_small(TYPE* left, TYPE* right);
TYPE* partition(TYPE* left, TYPE* right);
void insert_sort(TYPE* left, TYPE* right);
void heap_sort(TYPE* left, TYPE* right);
int max_threads;
atomic_int n_threads;
pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
void* sort_thr(void *arg) {
TYPE** stack = (TYPE**)arg;
int sp = 0;
TYPE* left = stack[sp];
TYPE* right = stack[sp + 1];
while (1) {
int partsz = right - left;
if (partsz < INSERTSORT_THRE) {
insert_sort(left, right);
goto pop_stack;
}
TYPE* l;
TYPE* r;
TYPE* mid;
if (partsz <= SMALLPART) mid = partition_small(left, right);
else mid = partition(left, right);
if (partsz > SMALLPART && mid - left < 100) {
heap_sort(left, right);
goto pop_stack;
}
if (mid < left + partsz / 2) {
l = mid + 1;
r = right;
right = mid - 1;
}
else {
l = left;
r = mid - 1;
left = mid + 1;
}
// soft limit: may exceed max_threads
TYPE** stack_new = NULL;
if (r - l > 1000000 && n_threads < max_threads) {
// start a new thread - max_threads is a soft limit
stack_new = malloc(64 * sizeof(TYPE*));
// stack size 64 is sufficient for n <= 2^32
if (stack_new) {
stack_new[0] = l;
stack_new[1] = r;
pthread_mutex_lock(&mtx);
pthread_t thread;
if (pthread_create(&thread, NULL, sort_thr, stack_new) == 0) {
n_threads += 1;
pthread_detach(thread);
}
else {
free(stack_new);
stack_new = NULL;
}
pthread_mutex_unlock(&mtx);
}
}
if (stack_new == NULL) {
stack[sp] = l;
stack[sp + 1] = r;
sp += 2;
}
continue;
pop_stack:
if (sp == 0) break;
sp -= 2;
left = stack[sp];
right = stack[sp + 1];
}
free(stack);
pthread_mutex_lock(&mtx);
n_threads -= 1;
if (n_threads == 0) pthread_cond_signal(&cond);
pthread_mutex_unlock(&mtx);
return NULL;
}
void sort(TYPE* data, int len) {
int n_cpus = sysconf(_SC_NPROCESSORS_ONLN);
// printf("CPUs: %d\n", n_cpus);
if (n_cpus > 0) max_threads = n_cpus * 2;
else max_threads = 8;
for (TYPE* p = data + 1; p < data + len; p++) {
if (*p < *data) swap(*p, *data);
}
pthread_t thread;
TYPE** stack = malloc(64 * sizeof(TYPE*));
stack[0] = data + 1;
stack[1] = data + len - 1;
n_threads = 1;
pthread_create(&thread, NULL, sort_thr, stack);
pthread_mutex_lock(&mtx);
while (n_threads != 0) pthread_cond_wait(&cond, &mtx);
pthread_mutex_unlock(&mtx);
}
void heap_sort(TYPE* left, TYPE* right) {
long n = right - left + 1;
if (n < 2) return;
for (long i = n / 2, j, k; ; ) {
if (i > 0) {
k = left[--i];
}
else {
n -= 1;
if (n == 0) return;
k = left[n];
left[n] = left[0];
}
j = i;
while (j * 2 + 1 < n) {
long child = j * 2 + 1;
if (child + 1 < n && IS_LOWER(left[child], left[child + 1])) child++;
if (!IS_LOWER(k, left[child])) break;
left[j] = left[child];
j = child;
}
left[j] = k;
}
}
#define sort2(a, b) do { \
if (IS_LOWER(b, a)) swap(a, b); \
} while(0) \
#define sort3(a, b, c) do { \
sort2(a, b); \
sort2(b, c); \
sort2(a, b); \
} while(0) \
#define sort4(a, b, c, d) do { \
sort2(a, b); \
sort2(c, d); \
sort2(a, c); \
sort2(b, d); \
sort2(b, c); \
} while(0) \
#define sort5(a, b, c, d, e) do { \
sort2(b, c); \
sort2(d, e); \
sort2(b, d); \
sort2(a, c); \
sort2(a, d); \
sort2(c, e); \
sort2(a, b); \
sort2(c, d); \
sort2(b, c); \
} while(0) \
void insert_sort(TYPE* left, TYPE* right) {
int h = right - left;
if (h >= 4) {
sort5(*left, *(left + 1), *(left + 2), *(left + 3), *(left + 4));
for (TYPE* pi = left + 5; pi <= right; pi++) {
TYPE h = *pi;
TYPE* pj = pi - 1;
while (IS_LOWER(h, *pj)) {
*(pj + 1) = *pj;
pj -= 1;
}
*(pj + 1) = h;
}
}
else {
switch (h) {
case 3:
sort4(*left, *(left + 1), *(left + 2), *(left + 3));
break;
case 2:
sort3(*left, *(left + 1), *(left + 2));
break;
case 1:
sort2(*left, *(left + 1));
break;
}
}
}
#define SELSZ 16
void quicksel(TYPE* left, TYPE* right) {
TYPE _Alignas(64) swap[2 * SELSZ];
memcpy(swap, left + 1, SELSZ * sizeof(TYPE));
memcpy(left + 1, right + 1, SELSZ * sizeof(TYPE));
TYPE* l0 = left - SELSZ;
TYPE* r0 = left + SELSZ;
while (1) {
TYPE piv = *l0;
TYPE* l = l0 + 1;
TYPE* r = r0;
while (1) {
while (IS_LOWER(*l, piv)) l++;
while (IS_LOWER(piv, *r)) r--;
if (l >= r) break;
swap(*l, *r);
l++;
r--;
}
*l0 = *r;
*r = piv;
if (r == left) break;
if (r < left) l0 = r + 1;
else r0 = r - 1;
}
memcpy(right + 1, left + 1, SELSZ * sizeof(TYPE));
memcpy(left + 1, swap, SELSZ * sizeof(TYPE));
}
TYPE* partition_small(TYPE* left, TYPE* right) {
unsigned n = right - left;
TYPE* outerleft = left;
TYPE* pivp = left + n / 2;
sort3(*(left + 1), *pivp, *right);
left += 2;
right--;
TYPE piv = *pivp;
*pivp = *outerleft;
TYPE _Alignas(64) swap[SMALLPART];
TYPE* sw = swap;
TYPE* left0 = left;
unsigned h;
while (left <= right - 8) {
for (int i = 8; i--;) {
h = IS_LOWER(*left, piv); *left0 = *sw = *left++; left0 += h; sw += !h;
}
}
while (left <= right) {
h = IS_LOWER(*left, piv); *left0 = *sw = *left++; left0 += h; sw += !h;
}
TYPE* mid = left0 - 1;
TYPE* sw0 = swap;
while (sw0 < sw) *left0++ = *sw0++;
*outerleft = *mid;
*mid = piv;
return mid;
}
TYPE* partition(TYPE* left, TYPE* right) {
unsigned n = right - left;
// n > SMALLPART
TYPE* outerleft = left;
TYPE* pivp = left + n / 2;
TYPE piv;
left++;
swap(*left, *pivp);
left += SELSZ;
right -= SELSZ;
quicksel(left, right);
piv = *left;
*left = *outerleft;
#define SWSZ 1024
TYPE _Alignas(64) swap[SWSZ];
TYPE* left0 = left;
TYPE* right0 = right;
unsigned h;
TYPE* sw = swap;
while (sw < swap + SWSZ - 16 && left <= right - 16) {
for (int i = 16; i--;) {
h = IS_LOWER(*right, piv); *right0 = *sw = *right--; right0 -= !h; sw += h;
}
}
while (sw < swap + SWSZ - 16 && left <= right) {
h = IS_LOWER(*right, piv); *right0 = *sw = *right--; right0 -= !h; sw += h;
}
while (1) {
while (right0 > right + 16 && left <= right - 16) {
for (int i = 16; i--;) {
h = IS_LOWER(*left, piv); *left0 = *right0 = *left++; left0 += h; right0 -= !h;
}
}
if (left > right - 16) break;
while (left0 < left - 16 && left <= right - 16) {
for (int i = 16; i--;) {
h = IS_LOWER(*right, piv); *right0 = *left0 = *right--; right0 -= !h; left0 += h;
}
}
if (left > right - 16) break;
}
while (right0 > right && left <= right) {
h = IS_LOWER(*left, piv); *left0 = *right0 = *left++; left0 += h; right0 -= !h;
}
while (left <= right) {
h = IS_LOWER(*right, piv); *right0 = *left0 = *right--; right0 -= !h; left0 += h;
}
TYPE* sw0 = swap;
while (sw0 < sw) *left0++ = *sw0++;
left0--;
*outerleft = *left0;
*left0 = piv;
return left0;
}
double t(void) {
static double t0;
struct timeval tv;
gettimeofday(&tv, NULL);
double h = t0;
t0 = tv.tv_sec + tv.tv_usec / 1000000.0;
return t0 - h;
}
unsigned chksum;
void init(TYPE* data, int len) {
unsigned chks = 0;
for (int i = 0; i < len; i++) {
data[i] = rand() / 128;
chks += *(unsigned*)&data[i];
}
chksum = chks;
}
void test(TYPE* data, int len) {
unsigned chks = data[0];
for (int i = 1; i < len; i++) {
if (data[i] < data[i - 1]) {
printf("ERROR ORDER\n");
break;
}
chks += *(unsigned*)&data[i];
}
if (chks != chksum) printf("ERROR CHKS\n");
}
#define SIZE (50 * 1000000)
TYPE data[SIZE];
int main(void) {
init(data, SIZE);
printf("Sorting %d million numbers with Threaded Branchless Quicksort ...\n",
SIZE / 1000000);
t();
sort(data, SIZE);
printf("%.3fs\n", t());
test(data, SIZE);
return 0;
}
Sorting 50 million numbers with Threaded Branchless Quicksort ...
0.275s
Multithreading and the avoidance of branch mispredictions can significantly improve the application performance of programs.