When ‘if’ slows you down, avoid it
New threads are created only for large partitions and are limited so that we don’t start too many threads. Smaller partitions are processed in the current thread.
// SPDX-License-Identifier: MIT
// (c) 2026 christof.kaser@gmail.com
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>
#include <pthread.h>
#include <unistd.h>
#include <stdatomic.h>
#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int
#define swap(a, b) do { __typeof__(a) h = a; a = b; b = h; } while (0)
#define NAME "Threaded Branchless Quicksort"
TYPE* partition(TYPE* left, TYPE* right);
void sorting_network(TYPE* left, int size);
void heap_sort(TYPE* left, TYPE* right);
TYPE* partition_small(TYPE* left, TYPE* right);
#define SMALLPART 3072
int max_threads;
atomic_int n_threads;
pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
void* sort_thr(void *arg) {
TYPE** stack = (TYPE**)arg;
int sp = 0;
TYPE* left = stack[sp];
TYPE* right = stack[sp + 1];
while (1) {
int partsz = right - left;
if (partsz <= 15) {
sorting_network(left, partsz);
goto pop_stack;
}
TYPE* l;
TYPE* r;
TYPE* mid;
if (partsz <= SMALLPART) mid = partition_small(left, right);
else mid = partition(left, right);
if (partsz > SMALLPART && mid - left < 100) {
heap_sort(left, right);
goto pop_stack;
}
if (mid < left + partsz / 2) {
l = mid + 1;
r = right;
right = mid - 1;
}
else {
l = left;
r = mid - 1;
left = mid + 1;
}
// soft limit: may exceed max_threads
TYPE** stack_new = NULL;
if (r - l > 1000000 && n_threads < max_threads) {
// start a new thread - max_threads is a soft limit
stack_new = malloc(64 * sizeof(TYPE*));
// stack size 64 is sufficient for n <= 2^32
if (stack_new) {
stack_new[0] = l;
stack_new[1] = r;
pthread_mutex_lock(&mtx);
pthread_t thread;
if (pthread_create(&thread, NULL, sort_thr, stack_new) == 0) {
n_threads += 1;
pthread_detach(thread);
}
else {
free(stack_new);
stack_new = NULL;
}
pthread_mutex_unlock(&mtx);
}
}
if (stack_new == NULL) {
stack[sp] = l;
stack[sp + 1] = r;
sp += 2;
}
continue;
pop_stack:
if (sp == 0) break;
sp -= 2;
left = stack[sp];
right = stack[sp + 1];
}
free(stack);
pthread_mutex_lock(&mtx);
n_threads -= 1;
if (n_threads == 0) pthread_cond_signal(&cond);
pthread_mutex_unlock(&mtx);
return NULL;
}
void sort(TYPE* data, int len) {
int n_cpus = sysconf(_SC_NPROCESSORS_ONLN);
// printf("CPUs: %d\n", n_cpus);
if (n_cpus > 0) max_threads = n_cpus * 2;
else max_threads = 8;
for (TYPE* p = data + 1; p < data + len; p++) {
if (*p < *data) swap(*p, *data);
}
pthread_t thread;
TYPE** stack = malloc(64 * sizeof(TYPE*));
stack[0] = data + 1;
stack[1] = data + len - 1;
n_threads = 1;
pthread_create(&thread, NULL, sort_thr, stack);
pthread_mutex_lock(&mtx);
while (n_threads != 0) pthread_cond_wait(&cond, &mtx);
pthread_mutex_unlock(&mtx);
}
// ---------------------- sorting network ----------------------------
#define sort2(a, b) do { \
__typeof__(a) x = a; \
__typeof__(a) y = b; \
unsigned m = IS_LOWER(x, y); \
a = m ? x : y; \
b = m ? y : x; \
} while(0)
// https://bertdobbelaere.github.io/sorting_networks.html
#define sort3(a, b, c) do { \
sort2(a, b); sort2(b, c); sort2(a, b); \
} while(0)
#define sort4(a, b, c, d) do { \
sort2(a, b); sort2(c, d); sort2(a, c); \
sort2(b, d); sort2(b, c); \
} while(0)
#define sort5(a, b, c, d, e) do { \
sort2(b, c); sort2(d, e); sort2(b, d); \
sort2(a, c); sort2(a, d); sort2(c, e); \
sort2(a, b); sort2(c, d); sort2(b, c); \
} while(0)
#define sort6(a, b, c, d, e, f) do { \
sort2(a, b); sort2(c, d); sort2(e, f); \
sort2(a, c); sort2(b, d); sort2(e, f); \
sort2(a, e); sort2(b, f); sort2(c, e); \
sort2(d, f); sort2(b, c); sort2(d, e); \
sort2(c, d); \
} while(0)
#define sort7(a, b, c, d, e, f, g) do { \
sort2(a, b); sort2(c, d); sort2(a, c); \
sort2(b, d); sort2(b, c); sort2(e, f); \
sort2(e, g); sort2(f, g); sort2(a, e); \
sort2(b, f); sort2(c, g); sort2(b, e); \
sort2(d, g); sort2(c, e); sort2(b, c); \
sort2(d, f); sort2(d, e); \
} while(0)
#define sort8(a,b,c,d,e,f,g,h) do { \
sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); \
sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); \
sort2(b,c); sort2(f,g); \
sort2(a,e); sort2(b,f); sort2(c,g); sort2(d,h); \
sort2(c,e); sort2(d,f); \
sort2(b,c); sort2(d,e); sort2(f,g); \
} while (0)
#define sort9(a,b,c,d,e,f,g,h,i) do { \
sort2(a,d); sort2(b,h); sort2(c,f); sort2(e,i); \
sort2(a,h); sort2(c,e); sort2(d,i); sort2(f,g); \
sort2(a,c); sort2(b,d); sort2(e,f); sort2(h,i); \
sort2(b,e); sort2(d,g); sort2(f,h); \
sort2(a,b); sort2(c,e); sort2(d,f); sort2(g,i); \
sort2(c,d); sort2(e,f); sort2(g,h); \
sort2(b,c); sort2(d,e); sort2(f,g); \
} while (0)
#define sort10(a,b,c,d,e,f,g,h,i,j) do { \
sort2(a,i); sort2(b,j); sort2(c,h); sort2(d,f); sort2(e,g); \
sort2(a,c); sort2(b,e); sort2(f,i); sort2(h,j); \
sort2(a,d); sort2(c,e); sort2(f,h); sort2(g,j); \
sort2(a,b); sort2(d,g); sort2(i,j); \
sort2(b,f); sort2(c,d); sort2(e,i); sort2(g,h); \
sort2(b,c); sort2(d,f); sort2(e,g); sort2(h,i); \
sort2(c,d); sort2(e,f); sort2(g,h); \
sort2(d,e); sort2(f,g); \
} while (0)
#define sort11(a,b,c,d,e,f,g,h,i,j,k) do { \
sort2(a,j); sort2(b,g); sort2(c,e); sort2(d,h); sort2(f,i); \
sort2(a,b); sort2(d,f); sort2(e,k); sort2(g,j); sort2(h,i); \
sort2(b,d); sort2(c,f); sort2(e,h); sort2(i,k); \
sort2(a,e); sort2(b,c); sort2(d,h); sort2(f,j); sort2(g,i); \
sort2(a,b); sort2(c,g); sort2(e,f); sort2(h,i); sort2(j,k); \
sort2(c,e); sort2(d,g); sort2(f,h); sort2(i,j); \
sort2(b,c); sort2(d,e); sort2(f,g); sort2(h,i); \
sort2(c,d); sort2(e,f); sort2(g,h); \
} while (0)
#define sort12(a,b,c,d,e,f,g,h,i,j,k,l) do { \
sort2(a,i); sort2(b,h); sort2(c,g); sort2(d,l); sort2(e,k); sort2(f,j); \
sort2(a,c); sort2(b,e); sort2(d,f); sort2(g,i); sort2(h,k); sort2(j,l); \
sort2(a,b); sort2(c,j); sort2(e,h); sort2(f,g); sort2(k,l); \
sort2(b,d); sort2(c,h); sort2(e,j); sort2(i,k); \
sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
sort2(b,c); sort2(d,f); sort2(g,i); sort2(j,k); \
sort2(c,e); sort2(d,g); sort2(f,i); sort2(h,j); \
sort2(b,c); sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
} while (0)
#define sort13(a,b,c,d,e,f,g,h,i,j,k,l,m) do { \
sort2(a,l); sort2(b,h); sort2(c,e); sort2(d,f); sort2(i,j); sort2(k,m); \
sort2(a,c); sort2(d,g); sort2(e,m); sort2(f,h); sort2(i,k); \
sort2(a,i); sort2(b,d); sort2(c,f); sort2(e,j); sort2(g,l); sort2(h,m); \
sort2(a,b); sort2(c,k); sort2(d,i); sort2(e,g); sort2(j,l); \
sort2(b,d); sort2(c,e); sort2(f,k); sort2(g,i); sort2(h,j); sort2(l,m); \
sort2(b,c); sort2(d,e); sort2(f,i); sort2(g,j); sort2(h,k); \
sort2(c,d); sort2(e,h); sort2(f,g); sort2(i,l); sort2(j,k); \
sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
} while (0)
#define sort14(a,b,c,d,e,f,g,h,i,j,k,l,m,n) do { \
sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); \
sort2(i,j); sort2(k,l); sort2(m,n); \
sort2(a,c); sort2(b,d); sort2(e,i); sort2(f,j); \
sort2(k,m); sort2(l,n); \
sort2(a,k); sort2(b,g); sort2(c,l); sort2(d,n); \
sort2(f,i); sort2(h,m); \
sort2(b,e); sort2(c,i); sort2(d,g); \
sort2(f,l); sort2(h,k); sort2(j,m); \
sort2(a,b); sort2(d,j); sort2(e,k); \
sort2(f,h); sort2(g,i); sort2(m,n); \
sort2(b,f); sort2(c,e); sort2(d,h); \
sort2(g,k); sort2(i,m); sort2(j,l); \
sort2(b,c); sort2(d,f); sort2(e,g); \
sort2(h,j); sort2(i,k); sort2(l,m); \
sort2(c,d); sort2(e,f); sort2(g,h); \
sort2(i,j); sort2(k,l); \
sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
} while (0)
#define sort15(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o) do { \
sort2(a,g); sort2(b,k); sort2(c,o); sort2(d,j); \
sort2(e,m); sort2(f,n); sort2(h,l); \
sort2(a,h); sort2(c,f); sort2(d,e); \
sort2(g,l); sort2(i,k); sort2(j,m); sort2(n,o); \
sort2(b,n); sort2(c,d); sort2(e,g); \
sort2(f,j); sort2(h,i); sort2(k,o); sort2(l,m); \
sort2(a,d); sort2(b,e); sort2(f,h); \
sort2(g,n); sort2(i,j); sort2(k,l); sort2(m,o); \
sort2(a,c); sort2(b,f); sort2(d,i); \
sort2(e,g); sort2(h,k); sort2(j,l); sort2(m,n); \
sort2(a,b); sort2(c,f); sort2(d,k); \
sort2(e,i); sort2(g,h); sort2(j,m); sort2(l,n); \
sort2(b,c); sort2(d,e); sort2(f,g); \
sort2(h,j); sort2(i,k); sort2(l,m); \
sort2(d,f); sort2(e,g); sort2(h,i); sort2(j,k); \
sort2(c,d); sort2(e,f); sort2(g,h); \
sort2(i,j); sort2(k,l); \
} while (0)
#define sort16(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) do { \
sort2(a,f); sort2(b,e); sort2(c,m); sort2(d,n); \
sort2(g,h); sort2(i,j); sort2(k,p); sort2(l,o); \
sort2(a,c); sort2(b,k); sort2(d,g); sort2(e,h); \
sort2(f,o); sort2(i,l); sort2(j,m); sort2(n,p); \
sort2(a,i); sort2(b,d); sort2(c,l); sort2(e,n); \
sort2(f,j); sort2(g,k); sort2(h,p); sort2(m,o); \
sort2(a,b); sort2(c,e); sort2(d,i); sort2(f,g); \
sort2(h,m); sort2(j,k); sort2(l,n); sort2(o,p); \
sort2(b,d); sort2(c,f); sort2(e,i); sort2(g,j); \
sort2(h,l); sort2(k,n); sort2(m,o); \
sort2(b,c); sort2(d,f); sort2(e,l); sort2(g,i); \
sort2(h,j); sort2(k,m); sort2(n,o); \
sort2(c,d); sort2(e,f); sort2(g,h); sort2(i,j); \
sort2(k,l); sort2(m,n); \
sort2(e,g); sort2(f,h); sort2(i,k); sort2(j,l); \
sort2(d,e); sort2(f,g); sort2(h,i); \
sort2(j,k); sort2(l,m); \
} while (0)
#define sort17(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q) do { \
sort2(b,c); sort2(d,e); sort2(f,g); sort2(h,i); \
sort2(j,k); sort2(l,m); sort2(n,o); sort2(p,q); \
sort2(b,d); sort2(c,e); sort2(f,h); sort2(g,i); \
sort2(j,l); sort2(k,m); sort2(n,p); sort2(o,q); \
sort2(b,f); sort2(c,g); sort2(d,h); sort2(e,i); \
sort2(j,n); sort2(k,o); sort2(l,p); sort2(m,q); \
sort2(a,d); sort2(b,n); sort2(c,k); sort2(e,h); \
sort2(f,l); sort2(g,m); sort2(i,j); sort2(o,p); \
sort2(a,n); sort2(b,i); sort2(c,f); sort2(d,g); \
sort2(e,o); sort2(h,p); sort2(j,q); sort2(k,l); \
sort2(a,b); sort2(c,i); sort2(d,e); sort2(f,k); \
sort2(g,n); sort2(h,l); sort2(m,o); \
sort2(b,f); sort2(d,i); sort2(e,k); sort2(g,h); \
sort2(j,m); sort2(l,n); \
sort2(b,c); sort2(e,g); sort2(f,i); sort2(h,k); \
sort2(j,l); sort2(m,o); sort2(n,p); \
sort2(c,d); sort2(e,f); sort2(g,i); sort2(h,j); \
sort2(k,l); sort2(m,n); sort2(o,p); \
sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
sort2(l,m); sort2(n,o); sort2(p,q); \
} while (0)
#define sort18(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r) do { \
sort2(a,g); sort2(b,k); sort2(c,p); sort2(d,f); sort2(e,j); \
sort2(h,q); sort2(i,n); sort2(l,r); sort2(m,o); \
sort2(a,m); sort2(b,e); sort2(d,l); sort2(f,r); sort2(g,o); \
sort2(h,i); sort2(j,k); sort2(n,q); \
sort2(b,n); sort2(c,h); sort2(e,q); sort2(g,j); \
sort2(i,l); sort2(k,p); \
sort2(a,b); sort2(c,d); sort2(e,m); sort2(f,n); \
sort2(h,j); sort2(i,k); sort2(o,p); sort2(q,r); \
sort2(a,c); sort2(b,l); sort2(d,e); sort2(f,h); \
sort2(g,q); sort2(k,m); sort2(n,o); sort2(p,r); \
sort2(b,i); sort2(e,k); sort2(f,g); sort2(h,n); \
sort2(j,q); sort2(l,m); \
sort2(b,d); sort2(c,f); sort2(e,h); sort2(g,i); \
sort2(j,l); sort2(k,n); sort2(m,p); sort2(o,q); \
sort2(b,c); sort2(d,f); sort2(e,g); sort2(h,j); \
sort2(i,k); sort2(l,n); sort2(m,o); sort2(p,q); \
sort2(c,d); sort2(f,i); sort2(g,h); sort2(j,m); \
sort2(k,l); sort2(o,p); \
sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
sort2(l,m); sort2(n,o); \
sort2(e,f); sort2(g,h); sort2(i,j); \
sort2(k,l); sort2(m,n); \
} while (0)
#define sort19(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s) do { \
sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); \
sort2(i,k); sort2(l,m); sort2(n,o); sort2(p,q); sort2(r,s); \
sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); \
sort2(i,j); sort2(l,n); sort2(m,o); sort2(p,r); sort2(q,s); \
sort2(a,e); sort2(b,f); sort2(c,g); sort2(d,h); \
sort2(j,k); sort2(l,p); sort2(m,q); sort2(n,r); sort2(o,s); \
sort2(a,l); sort2(b,i); sort2(c,n); sort2(d,r); \
sort2(e,k); sort2(f,g); sort2(j,q); sort2(m,p); \
sort2(b,c); sort2(d,n); sort2(e,m); sort2(f,o); \
sort2(g,q); sort2(h,k); sort2(i,p); \
sort2(a,b); sort2(c,l); sort2(d,j); sort2(f,m); \
sort2(g,p); sort2(h,n); sort2(k,s); sort2(o,r); \
sort2(b,e); sort2(d,i); sort2(f,l); sort2(g,j); \
sort2(h,m); sort2(k,n); sort2(o,p); sort2(q,r); \
sort2(c,e); sort2(d,f); sort2(g,h); \
sort2(i,l); sort2(j,m); sort2(k,o); sort2(n,p); \
sort2(c,d); sort2(e,f); sort2(g,i); sort2(h,j); \
sort2(k,l); sort2(m,o); sort2(n,q); sort2(p,r); \
sort2(b,c); sort2(e,g); sort2(f,i); sort2(h,k); \
sort2(j,l); sort2(m,n); sort2(o,q); \
sort2(d,e); sort2(f,g); sort2(h,i); \
sort2(j,k); sort2(l,m); sort2(n,o); sort2(p,q); \
} while (0)
#define sort20(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t) do { \
sort2(a,m); sort2(b,n); sort2(c,o); sort2(d,p); sort2(e,q); \
sort2(f,r); sort2(g,s); sort2(h,t); sort2(i,k); sort2(j,l); \
sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); \
sort2(i,j); sort2(k,l); sort2(m,o); sort2(n,p); sort2(q,s); sort2(r,t); \
sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); \
sort2(m,n); sort2(o,p); sort2(q,r); sort2(s,t); \
sort2(a,e); sort2(b,m); sort2(c,q); sort2(d,r); \
sort2(f,i); sort2(g,j); sort2(h,s); \
sort2(k,n); sort2(l,o); sort2(p,t); \
sort2(b,g); sort2(d,k); sort2(e,f); sort2(h,l); \
sort2(i,m); sort2(j,q); sort2(n,s); sort2(o,p); \
sort2(a,e); sort2(c,i); sort2(d,j); sort2(g,h); \
sort2(k,q); sort2(l,r); sort2(m,n); sort2(p,t); \
sort2(b,e); sort2(d,g); sort2(f,i); sort2(h,k); \
sort2(j,m); sort2(l,o); sort2(n,q); sort2(p,s); \
sort2(c,d); sort2(e,f); sort2(g,i); sort2(h,j); \
sort2(k,m); sort2(l,n); sort2(o,p); sort2(q,r); \
sort2(c,e); sort2(d,g); sort2(f,h); sort2(i,k); \
sort2(j,l); sort2(m,o); sort2(n,q); sort2(p,r); \
sort2(b,c); sort2(d,f); sort2(g,h); sort2(i,j); \
sort2(k,l); sort2(m,n); sort2(o,q); sort2(r,s); \
sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
sort2(l,m); sort2(n,o); sort2(p,q); \
} while (0)
#define sort21(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u) do { \
sort2(a,h); sort2(b,k); sort2(d,f); sort2(e,i); sort2(g,n); \
sort2(j,t); sort2(l,o); sort2(m,r); sort2(p,q); sort2(s,u); \
sort2(a,l); sort2(b,p); sort2(c,m); sort2(d,e); sort2(f,i); \
sort2(g,j); sort2(h,o); sort2(k,q); sort2(n,t); sort2(r,u); \
sort2(a,g); sort2(b,d); sort2(c,s); sort2(e,p); sort2(f,k); \
sort2(i,q); sort2(l,r); sort2(m,n); sort2(o,u); \
sort2(c,g); sort2(f,m); sort2(h,s); sort2(i,o); \
sort2(j,l); sort2(k,r); sort2(n,t); sort2(q,u); \
sort2(b,c); sort2(e,h); sort2(f,j); sort2(g,r); \
sort2(k,n); sort2(l,m); sort2(o,t); sort2(p,s); \
sort2(a,c); sort2(d,g); sort2(e,f); sort2(h,k); \
sort2(i,l); sort2(j,p); sort2(m,q); sort2(n,s); \
sort2(o,r); sort2(t,u); \
sort2(a,b); sort2(c,d); sort2(f,j); sort2(g,m); \
sort2(h,i); sort2(l,o); sort2(n,p); sort2(q,t); sort2(r,s); \
sort2(b,c); sort2(d,j); sort2(g,n); sort2(k,l); \
sort2(m,p); sort2(q,r); sort2(s,t); \
sort2(b,e); sort2(c,f); sort2(d,h); sort2(g,k); \
sort2(i,j); sort2(l,m); sort2(n,o); sort2(r,s); \
sort2(c,e); sort2(f,g); sort2(h,i); sort2(j,l); \
sort2(k,n); sort2(m,p); sort2(o,q); \
sort2(d,e); sort2(f,h); sort2(g,i); sort2(j,k); \
sort2(l,n); sort2(m,o); sort2(p,q); \
sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
sort2(m,n); sort2(o,p); sort2(q,r); \
} while (0)
#define sort22(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v) do { \
sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
sort2(m,n); sort2(o,p); sort2(q,r); sort2(s,t); sort2(u,v); \
sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); sort2(i,k); sort2(l,n); \
sort2(o,q); sort2(p,r); sort2(s,u); sort2(t,v); \
sort2(a,e); sort2(b,f); sort2(c,g); sort2(d,h); sort2(i,m); sort2(j,n); \
sort2(o,s); sort2(p,t); sort2(q,u); sort2(r,v); \
sort2(a,o); sort2(b,p); sort2(c,s); sort2(d,t); sort2(e,q); sort2(f,r); \
sort2(g,u); sort2(h,v); sort2(j,l); sort2(k,m); \
sort2(a,i); sort2(c,k); sort2(e,o); sort2(f,m); sort2(g,p); sort2(h,r); \
sort2(j,q); sort2(l,t); sort2(n,v); \
sort2(b,j); sort2(c,e); sort2(d,q); sort2(f,s); sort2(g,k); sort2(h,n); \
sort2(i,o); sort2(l,p); sort2(m,u); sort2(r,t); \
sort2(b,i); sort2(d,l); sort2(e,f); sort2(h,m); sort2(j,o); sort2(k,s); \
sort2(n,u); sort2(q,r); \
sort2(b,c); sort2(d,f); sort2(e,i); sort2(g,j); sort2(h,l); sort2(k,o); \
sort2(m,p); sort2(n,r); sort2(q,s); sort2(t,u); \
sort2(c,e); sort2(d,g); sort2(f,j); sort2(h,k); sort2(l,o); sort2(m,q); \
sort2(p,s); sort2(r,t); \
sort2(d,e); sort2(f,h); sort2(g,i); sort2(j,l); sort2(k,m); sort2(n,p); \
sort2(o,q); sort2(r,s); \
sort2(f,g); sort2(h,i); sort2(j,k); sort2(l,m); sort2(n,o); sort2(p,q); \
sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); sort2(m,n); sort2(o,p); \
sort2(q,r); \
} while (0)
#define sort23(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w) do { \
sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
sort2(m,n); sort2(o,p); sort2(q,r); sort2(s,t); sort2(u,v); \
sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); sort2(i,k); sort2(j,l); \
sort2(m,o); sort2(n,p); sort2(r,t); sort2(s,u); sort2(v,w); \
sort2(a,e); sort2(b,f); sort2(c,g); sort2(d,h); \
sort2(i,m); sort2(j,n); sort2(k,o); sort2(l,p); \
sort2(q,v); sort2(r,w); \
sort2(b,k); sort2(c,j); sort2(d,l); sort2(g,t); \
sort2(m,r); sort2(o,w); sort2(q,s); sort2(u,v); \
sort2(a,q); sort2(b,c); sort2(d,v); sort2(e,r); \
sort2(f,o); sort2(g,n); sort2(h,w); sort2(j,s); \
sort2(k,u); sort2(p,t); \
sort2(b,k); sort2(c,j); sort2(d,r); sort2(e,m); \
sort2(f,s); sort2(g,u); sort2(h,p); sort2(i,q); \
sort2(l,o); sort2(n,v); sort2(t,w); \
sort2(a,i); sort2(b,e); sort2(c,k); sort2(d,j); \
sort2(f,g); sort2(l,v); sort2(m,q); sort2(n,u); \
sort2(o,p); sort2(r,s); \
sort2(c,i); sort2(d,f); sort2(e,m); sort2(g,j); \
sort2(h,l); sort2(k,q); sort2(n,r); sort2(p,v); \
sort2(s,u); \
sort2(b,c); sort2(e,i); sort2(f,k); sort2(g,m); \
sort2(h,n); sort2(j,q); sort2(l,s); sort2(o,r); \
sort2(p,t); \
sort2(c,e); sort2(d,f); sort2(g,i); sort2(h,j); \
sort2(k,m); sort2(l,n); sort2(o,q); sort2(p,u); \
sort2(r,s); sort2(t,v); \
sort2(d,g); sort2(f,i); sort2(h,k); sort2(j,m); \
sort2(l,o); sort2(n,q); sort2(p,r); sort2(s,u); \
sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
sort2(l,m); sort2(n,o); sort2(p,q); sort2(r,s); \
sort2(t,u); \
} while (0)
#define sort24(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x) do { \
sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
sort2(m,n); sort2(o,p); sort2(q,r); sort2(s,t); sort2(u,v); sort2(w,x); \
sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); sort2(i,k); sort2(j,l); \
sort2(m,o); sort2(n,p); sort2(q,s); sort2(r,t); sort2(u,w); sort2(v,x); \
sort2(a,e); sort2(b,f); sort2(c,g); sort2(d,h); \
sort2(i,m); sort2(j,n); sort2(k,o); sort2(l,p); \
sort2(q,u); sort2(r,v); sort2(s,w); sort2(t,x); \
sort2(a,q); sort2(b,s); sort2(c,r); sort2(d,t); \
sort2(e,u); sort2(f,w); sort2(g,v); sort2(h,x); \
sort2(j,k); sort2(n,o); \
sort2(c,k); sort2(d,l); sort2(f,s); sort2(g,o); \
sort2(h,p); sort2(i,q); sort2(j,r); sort2(m,u); \
sort2(n,v); \
sort2(a,i); sort2(b,j); sort2(c,m); sort2(d,u); \
sort2(e,q); sort2(f,n); sort2(g,r); sort2(h,t); \
sort2(k,s); sort2(l,v); sort2(o,w); sort2(p,x); \
sort2(b,i); sort2(d,q); sort2(e,m); sort2(f,k); \
sort2(g,j); sort2(h,u); sort2(l,t); sort2(n,s); \
sort2(o,r); sort2(p,w); \
sort2(c,e); sort2(d,f); sort2(h,n); sort2(j,m); \
sort2(k,q); sort2(l,o); sort2(s,u); sort2(t,v); \
sort2(b,c); sort2(e,i); sort2(f,j); sort2(g,k); \
sort2(h,l); sort2(m,q); sort2(n,r); sort2(o,s); \
sort2(p,t); sort2(v,w); \
sort2(c,e); sort2(d,i); sort2(f,g); sort2(h,j); \
sort2(k,m); sort2(l,n); sort2(o,q); sort2(p,u); \
sort2(r,s); sort2(t,v); \
sort2(d,f); sort2(g,i); sort2(h,k); sort2(j,m); \
sort2(l,o); sort2(n,q); sort2(p,r); sort2(s,u); \
sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
sort2(l,m); sort2(n,o); sort2(p,q); sort2(r,s); sort2(t,u); \
} while (0)
void sorting_network(TYPE* l, int partsz_min_1) {
switch (partsz_min_1) {
case 23:
sort24(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
l[16], l[17], l[18], l[19], l[20], l[21], l[22], l[23]);
break;
case 22:
sort23(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
l[16], l[17], l[18], l[19], l[20], l[21], l[22]);
break;
case 21:
sort22(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
l[16], l[17], l[18], l[19], l[20], l[21]);
break;
case 20:
sort21(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
l[16], l[17], l[18], l[19], l[20]);
break;
case 19:
sort20(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
l[16], l[17], l[18], l[19]);
break;
case 18:
sort19(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15],
l[16], l[17], l[18]);
break;
case 17:
sort18(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15], l[16], l[17]);
break;
case 16:
sort17(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15], l[16]);
break;
case 15:
sort16(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11], l[12], l[13], l[14], l[15]);
break;
case 14:
sort15(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11], l[12], l[13], l[14]);
break;
case 13:
sort14(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11], l[12], l[13]);
break;
case 12:
sort13(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11], l[12]);
break;
case 11:
sort12(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7],
l[8], l[9], l[10], l[11]);
break;
case 10:
sort11(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8], l[9], l[10]);
break;
case 9:
sort10(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8], l[9]);
break;
case 8:
sort9(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7], l[8]);
break;
case 7:
sort8(l[0], l[1], l[2], l[3], l[4], l[5], l[6], l[7]);
break;
case 6:
sort7(l[0], l[1], l[2], l[3], l[4], l[5], l[6]);
break;
case 5:
sort6(l[0], l[1], l[2], l[3], l[4], l[5]);
break;
case 4:
sort5(l[0], l[1], l[2], l[3], l[4]);
break;
case 3:
sort4(l[0], l[1], l[2], l[3]);
break;
case 2:
sort3(l[0], l[1], l[2]);
break;
case 1:
sort2(l[0], l[1]);
break;
case 0:
break;
}
}
// ---------------------- partition ----------------------------
#define med5(a, b, c, d, e) do { \
sort2(a, b); sort2(c, d); sort2(a, c); \
sort2(b, d); sort2(b, c); sort2(c, e); \
sort2(b, c); \
} while(0)
TYPE* partition_small(TYPE* restrict left, TYPE* restrict right) {
TYPE* outerleft = left;
TYPE* pivp = left + (right - left) / 2;
med5(left[1], left[2], *pivp, right[-1], *right);
left += 3;
right -= 2;
TYPE piv = *pivp;
*pivp = *outerleft;
TYPE _Alignas(64) swbuf[SMALLPART];
TYPE* sw = swbuf;
TYPE* left0 = left;
while (left <= right) {
unsigned h = IS_LOWER(*left, piv);
*left0 = *sw = *left++;
left0 += h;
sw += !h;
}
memcpy(left0, swbuf, (sw - swbuf) * sizeof(TYPE));
left0 -= 1;
*outerleft = *left0;
*left0 = piv;
return left0;
}
TYPE* partition(TYPE* restrict left, TYPE* restrict right) {
TYPE* outerleft = left;
int n = right - left;
TYPE* pivp = left + n / 2;
TYPE piv;
TYPE* q1 = left + n / 4;
TYPE* q3 = pivp + n / 4;
left += 1;
med5(left[0], left[1], left[2], left[3], left[4]);
med5(q1[-2], q1[-1], q1[0], q1[1], q1[2]);
med5(pivp[-2], pivp[-1], pivp[0], pivp[1], pivp[2]);
med5(q3[-2], q3[-1], q3[0], q3[1], q3[2]);
med5(right[-4], right[-3], right[-2], right[-1], right[0]);
med5(left[2], q1[0], pivp[0], q3[0], right[-2]);
piv = *pivp;
*pivp = *outerleft;
#define SWSZ 1024
TYPE _Alignas(64) swbuf[SWSZ];
TYPE* left0 = left;
TYPE* right0 = right;
TYPE* sw = swbuf;
#define UNROLL 16
while (sw < swbuf + SWSZ - UNROLL && left <= right - UNROLL) {
for (int i = UNROLL; i--;) {
unsigned h = IS_LOWER(*right, piv); *right0 = *sw = *right--; right0 -= !h; sw += h;
}
}
while (sw < swbuf + SWSZ - UNROLL && left <= right) {
unsigned h = IS_LOWER(*right, piv); *right0 = *sw = *right--; right0 -= !h; sw += h;
}
while (1) {
while (right0 > right + UNROLL && left <= right - UNROLL) {
for (int i = UNROLL; i--;) {
unsigned h = IS_LOWER(*left, piv); *left0 = *right0 = *left++; left0 += h; right0 -= !h;
}
}
while (left0 < left - UNROLL && left <= right - UNROLL) {
for (int i = UNROLL; i--;) {
unsigned h = IS_LOWER(*right, piv); *right0 = *left0 = *right--; right0 -= !h; left0 += h;
}
}
if (left > right - UNROLL) break;
}
while (right0 > right && left <= right) {
unsigned h = IS_LOWER(*left, piv); *left0 = *right0 = *left++; left0 += h; right0 -= !h;
}
while (left <= right) {
unsigned h = IS_LOWER(*right, piv); *right0 = *left0 = *right--; right0 -= !h; left0 += h;
}
memcpy(left0, swbuf, (sw - swbuf) * sizeof(TYPE));
*outerleft = *right0;
*right0 = piv;
return right0;
}
// ---------------------- heapsort, testing and main -----------------------------------
void heap_sort(TYPE* restrict left, TYPE* restrict right) {
long n = right - left + 1;
if (n < 2) return;
for (long i = n / 2, j, k; ; ) {
if (i > 0) {
k = left[--i];
}
else {
n -= 1;
if (n == 0) return;
k = left[n];
left[n] = left[0];
}
j = i;
while (j * 2 + 1 < n) {
long child = j * 2 + 1;
if (child + 1 < n && IS_LOWER(left[child], left[child + 1])) child++;
if (!IS_LOWER(k, left[child])) break;
left[j] = left[child];
j = child;
}
left[j] = k;
}
}
unsigned chksum;
unsigned hash_el(void *p, int sz) {
unsigned char* c = (unsigned char*)p;
unsigned h = 0;
for (int i = 0; i < sz; i++) h = (h * 131) + c[i];
return h;
}
void init(TYPE* data, int len) {
chksum = 0;
for (int i = 0; i < len; i++) {
data[i] = rand();
chksum += hash_el(&data[i], sizeof(TYPE));
}
}
void test(TYPE* data, int len) {
unsigned chks = hash_el(&data[0], sizeof(TYPE));;
for (int i = 1; i < len; i++) {
if (data[i] < data[i - 1]) {
printf("ERROR ORDER\n");
break;
}
chks += hash_el(&data[i], sizeof(TYPE));
}
if (chks != chksum) printf("ERROR CHKS\n");
}
double t(void) {
static double t0;
struct timeval tv;
gettimeofday(&tv, NULL);
double h = t0;
t0 = tv.tv_sec + tv.tv_usec / 1000000.0;
return t0 - h;
}
#define SIZE (50 * 1000000)
TYPE data[SIZE];
int main(void) {
srand(time(NULL));
init(data, SIZE);
printf("Sorting %d million numbers " NAME " ...\n", SIZE / 1000000);
t();
sort(data, SIZE);
printf("%.3fs\n", t());
test(data, SIZE);
return 0;
}
The measurements were taken on an Apple M1 system with the -O2 compiler option.
Sorting 50 million numbers with Threaded Branchless Quicksort ...
0.236s
Multithreading and the avoidance of branch mispredictions can significantly improve the application performance of programs.
christof.kaser@gmail.com