With single-thread performance stagnating and CPUs gaining ever more cores, multithreading has become an important topic. This tutorial shows how to distribute a calculation task among threads and thus speed up the calculation.
This is not possible for every algorithm. But Quicksort for example is a Divide and Conquer algorithm. This type of algorithm is usually well suited for parallelization.
This Quicksort is implemented recursively with tail call elimination. This means that after partitioning, one subarray is sorted recursively and the other in the current context in the program loop. The smaller subarray is sorted first so that this recursive implementation is stack-safe (avoids a possible stack overflow).
If the subarray becomes smaller than 30 elements, the algorithm switches to insertion sort, which is faster for small arrays.
package main
import (
"fmt"
"time"
"math/rand"
)
// InsertSort sorts data in place in ascending order using insertion
// sort. It is used for small slices, where it beats Quicksort.
func InsertSort(data []int) {
	for i := 1; i < len(data); i++ {
		key := data[i]
		j := i - 1
		// shift larger elements one slot to the right
		for ; j >= 0 && key < data[j]; j-- {
			data[j+1] = data[j]
		}
		data[j+1] = key
	}
}
// Partition rearranges data around a pivot taken from the middle
// position so that smaller elements come first. It returns the two
// remaining subslices with the pivot excluded from both; the shorter
// part is returned first, so the caller can recurse on it.
func Partition(data []int) ([]int, []int) {
	half := len(data) / 2
	// move the middle element to the front and use it as pivot
	data[half], data[0] = data[0], data[half]
	piv := data[0]
	store := 0
	for i := 1; i < len(data); i++ {
		if data[i] < piv {
			store++
			data[i], data[store] = data[store], data[i]
		}
	}
	// put the pivot into its final position
	data[0], data[store] = data[store], data[0]
	if store < half {
		return data[:store], data[store+1:]
	}
	return data[store+1:], data[:store]
}
// QSort sorts data in place with Quicksort. The shorter part of each
// partition is sorted recursively, the longer part in the loop (tail
// call eliminated), which keeps the recursion depth bounded. Slices
// below 30 elements are finished with insertion sort.
func QSort(data []int) {
	for len(data) >= 30 {
		var small []int
		small, data = Partition(data)
		QSort(small)
	}
	InsertSort(data)
}
// RandomInit fills data with pseudo-random ints in [0, 1000000000).
func RandomInit(data []int) {
	for i := range data {
		data[i] = rand.Intn(1000000000)
	}
}
// TestSorted prints a message when data is not in ascending order.
// It does not modify data.
func TestSorted(data []int) {
	for i := 1; i < len(data); i++ {
		if data[i-1] > data[i] {
			fmt.Println("Data is not sorted")
			return
		}
	}
}
// main benchmarks sorting 50 million random numbers and verifies
// the result afterwards.
func main() {
	const count = 50000000
	var data [count]int
	RandomInit(data[:])
	fmt.Printf("Sorting %d million numbers with Quicksort in Go ...\n",
		count/1000000)
	start := time.Now()
	QSort(data[:])
	fmt.Printf("Time: %.2fs\n", time.Since(start).Seconds())
	TestSorted(data[:])
}
Sorting 50 million numbers with Quicksort in Go ...
Time: 11.74s
After the partition function, we have two independent arrays that can be sorted in parallel.
Starting a Goroutine, the light Go version of a thread, takes time and space. Therefore we start Goroutines only for larger arrays and if not too many goroutines are active.
With go
the Goroutine is started. A sync.WaitGroup
variable, which is shared between all running Goroutines, is used to count the active Goroutines. When the last Goroutine has finished its work, all the work is done.
package main
import (
"fmt"
"time"
"math/rand"
"sync"
"runtime"
)
.
.
.
// QuSort sorts data in place like QSort, but may hand the shorter
// part of a partition to a new goroutine. wg counts the goroutines
// started here so the caller can wait for all of them to finish.
func QuSort(data[] int, wg *sync.WaitGroup) {
var data_s[] int
for len(data) >= 30 {
// data_s gets the shorter part, data keeps the longer one
data_s, data = Partition(data)
// spawn a goroutine only for a big enough subarray and while the
// goroutine count stays below 8 per CPU; NumGoroutine is read
// without synchronization, so this is a soft (approximate) limit
if len(data_s) > 1000 &&
runtime.NumGoroutine() < runtime.NumCPU() * 8 {
wg.Add(1) // registered before the goroutine starts, so Wait cannot miss it
go func(d[] int) {
QuSort(d, wg)
wg.Done()
}(data_s)
} else {
QuSort(data_s, wg)
}
}
// subarrays below 30 elements are finished with insertion sort
InsertSort(data)
}
// QSort sorts data in place using the concurrent Quicksort and
// blocks until every worker goroutine has finished.
func QSort(data []int) {
	wg := new(sync.WaitGroup)
	QuSort(data, wg)
	wg.Wait()
}
.
.
.
Sorting 50 million numbers with Goroutines ...
Time: 4.57s
The other (virtual) cores of the older Intel i3 CPU helped sorting.
This Quicksort is implemented recursively with the faster Hoare partition scheme. We use the median of three fixed elements as the pivot element. This prevents the O(N²) worst case from occurring with random or sorted data, but not with specially prepared input data. In that case, a good random number generator should be used to select the pivot element.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>
#define swap(a, b) { int _h = a; a = b; b = _h; }
#define min(a, b) ((a) < (b) ? (a) : (b))
/* Insertion sort for the inclusive range [left, right].
   A first pass moves the minimum to the front; it then acts as a
   sentinel, so the inner loop of the second pass needs no j >= 0
   bounds check. */
void insert_sort(int* left, int* right) {
    for (int* p = left + 1; p <= right; p++) {
        if (*p < *left) {
            int tmp = *p;
            *p = *left;
            *left = tmp;
        }
    }
    for (int* p = left + 2; p <= right; p++) {
        int val = *p;
        int* q = p - 1;
        while (val < *q) {
            q[1] = *q;
            q -= 1;
        }
        q[1] = val;
    }
}
/* sort3fast(a, b, c): sorts the three int lvalues a, b, c in place
   with at most three comparisons; used to pick the median of three
   as the pivot. Fixes applied:
   - removed the stray trailing backslash after the final brace,
     which continued the macro into the next line of the file and
     swallowed the following function definition;
   - wrapped the body in do { } while (0) so the macro behaves like a
     single statement (safe in unbraced if/else);
   - swaps are written inline, removing the dependency on the swap()
     macro. */
#define sort3fast(a, b, c) \
do { \
    if (b < a) { \
        if (c < a) { \
            if (c < b) { int _h = a; a = c; c = _h; } \
            else { int _h = a; a = b; b = c; c = _h; } \
        } \
        else { int _h = a; a = b; b = _h; } \
    } \
    else if (c < b) { \
        if (c < a) { int _h = c; c = b; b = a; a = _h; } \
        else { int _h = b; b = c; c = _h; } \
    } \
} while (0)
/* Hoare partition of the inclusive range [left0, right0].
   The pivot is the median of the first, middle and last element
   (via sort3fast). On return the pivot sits at its final position;
   the two remaining subranges are passed back via the out
   parameters: (*l1, *r1) gets the roughly smaller side (for the
   caller to recurse on), (*l2, *r2) the other side. */
void partition(int* left0, int* right0, int** l1, int** r1, int** l2, int** r2) {
int* left = left0 + 1;
int* right = right0;
int* mid = left0 + (right0 - left0) / 2;
int piv = *mid;
/* park the element from slot left0+1 in the middle; the pivot will
   temporarily live at left0+1 */
*mid = *left;
/* median-of-3: afterwards *left0 <= piv <= *right0 */
sort3fast(*left0, piv, *right0);
*left = piv;
while (1) {
/* *left0 <= piv and *right0 >= piv act as sentinels, so these
   unguarded scans cannot run past the range */
do left += 1; while(*left < piv);
do right -= 1; while (*right > piv);
if (left >= right) break;
swap(*left, *right);
}
/* move the pivot from slot left0+1 into its final position */
*(left0 + 1) = *right;
*right = piv;
/* hand back the side away from the middle first - roughly the
   smaller one - so the caller recurses on it */
if (right < mid) {
*l1 = left0; *r1 = right - 1;
*l2 = right + 1; *r2 = right0;
}
else {
*l1 = right + 1; *r1 = right0;
*l2 = left0; *r2 = right - 1;
}
}
/* Sorts the inclusive range [left, right]. After each partition the
   roughly smaller side is sorted recursively and the other side in
   the loop (tail call eliminated), bounding the recursion depth.
   Ranges below 50 elements are finished with insertion sort. */
void qusort(int* left, int* right) {
    while (right - left >= 50) {
        int *lo, *hi;
        partition(left, right, &lo, &hi, &left, &right);
        qusort(lo, hi);
    }
    insert_sort(left, right);
}
/* Sorts data[0 .. len-1] in ascending order.
   Guards against len < 1: the original computed data + len - 1,
   which for len == 0 forms a pointer before the array (undefined
   behavior). Arrays of 0 or 1 elements are already sorted. */
void sort(int* data, int len) {
    if (len < 2) return;
    qusort(data, data + len - 1);
}
/* Fills data[0 .. len-1] with pseudo-random non-negative ints. */
void init(int* data, int len) {
    for (int* p = data; p < data + len; p++) {
        *p = rand();
    }
}
/* Returns the seconds elapsed since the previous call (wall clock).
   The first call returns the seconds since the Unix epoch, so
   callers invoke it once to start the clock and once to read the
   interval. Uses one static slot; not thread-safe. */
double t(void) {
    static double last;
    struct timeval tv;
    gettimeofday(&tv, NULL);
    double now = tv.tv_sec + tv.tv_usec / 1000000.0;
    double elapsed = now - last;
    last = now;
    return elapsed;
}
/* Prints ERROR if data[0 .. len-1] is not in ascending order.
   Does not modify the data. */
void test(int* data, int len) {
    for (int i = 1; i < len; i++) {
        if (data[i - 1] > data[i]) {
            printf("ERROR\n");
            return;
        }
    }
}
/* Writes the len values of data to stdout, one per line. */
void print(int* data, int len) {
    for (int* p = data; p != data + len; p++) {
        printf("%d\n", *p);
    }
}
#define SIZE (50 * 1000000)

/* file scope: ~200 MB would be far too large for the stack */
int data[SIZE];

/* Benchmark: sort 50 million random ints, print the elapsed time
   and verify the result. */
int main(void) {
    init(data, SIZE);
    printf("Sorting %d million numbers with Quicksort ...\n",
        SIZE / 1000000);
    t(); /* start the clock */
    sort(data, SIZE);
    printf("%.2fs\n", t());
    test(data, SIZE);
    return 0;
}
Sorting 50 million numbers with Quicksort ...
9.48s
For comparison C++ std::sort:
.
.
#include <algorithm>
// Thin wrapper with the same signature as the C version, so the
// benchmark harness stays unchanged; delegates to std::sort.
void sort(int* data, int len) {
    int* first = data;
    int* last = data + len;
    std::sort(first, last);
}
.
.
Sorting 50 million numbers with std::sort ...
10.21s
The paper https://arxiv.org/abs/1604.06697 by S. Edelkamp and A. Weiß shows how the performance of partitioning can be improved by avoiding conditional branches.
.
.
/* BlockQuicksort partition (Edelkamp/Weiss): same interface and
   result as the plain Hoare partition, but large ranges are first
   processed in blocks of BSZ elements whose misplaced positions are
   collected branch-free into offset buffers and then swapped in
   pairs. Pivot: median of first, middle and last element. */
void partition(int* left0, int* right0, int** l1, int** r1, int** l2, int** r2) {
int* mid = left0 + (right0 - left0) / 2;
int piv = *mid;
/* park the element from slot left0+1 in the middle; the pivot
   temporarily lives at left0+1 */
*mid = *(left0 + 1);
sort3fast(*left0, piv, *right0);
*(left0 + 1) = piv;
int *left, *right;
#define BSZ 256
if (right0 - left0 > 2 * BSZ + 3) {
left = left0 + 2;
right = right0 - 1;
/* offl collects pointers to left-side elements > piv,
   offr pointers to right-side elements < piv.
   NOTE(review): 'or' is a valid identifier only in C - in C++ it is
   an alternative operator token and this would not compile. */
int* offl[BSZ];
int* offr[BSZ];
int** ol = offl;
int** or = offr;
do {
/* refill the left buffer when empty: store every position but
   advance ol only for misplaced elements - the comparison result
   is added, so there is no conditional branch */
if (ol == offl) {
int* pd = left;
do {
*ol = pd;
ol += (piv < *pd);
pd += 1;
}
while (pd < left + BSZ);
}
/* same for the right block, scanning downwards */
if (or == offr) {
int* pd = right;
do {
*or = pd;
or += (piv > *pd);
pd -= 1;
}
while (pd > right - BSZ);
}
/* swap as many misplaced pairs as both buffers can provide */
int min = min(ol - offl, or - offr);
ol -= min;
or -= min;
for (int i = 0; i < min; i++) {
swap(**(ol + i), **(or + i));
}
/* a drained buffer means its whole block is on the correct side:
   advance that boundary by a full block */
if (ol == offl) left += BSZ;
if (or == offr) right -= BSZ;
}
while (right - left > 2 * BSZ);
/* step back one slot: the scalar loop below pre-increments /
   pre-decrements before each comparison */
left -= 1;
right += 1;
}
else {
/* small range: skip the block phase entirely */
left = left0 + 1;
right = right0;
}
/* classic Hoare scan finishes the remaining middle part; the
   ordering from sort3fast provides the stopping sentinels */
while (1) {
do left += 1; while(*left < piv);
do right -= 1; while (*right > piv);
if (left >= right) break;
swap(*left, *right);
}
/* move the pivot from slot left0+1 into its final position */
*(left0 + 1) = *right;
*right = piv;
/* hand back the roughly smaller side via (l1, r1) for recursion */
if (right < mid) {
*l1 = left0; *r1 = right - 1;
*l2 = right + 1; *r2 = right0;
}
else {
*l1 = right + 1; *r1 = right0;
*l2 = left0; *r2 = right - 1;
}
}
.
.
Sorting 50 million numbers with BlockQuicksort ...
6.24s
We only start a new thread when the array to sort is big enough and when there are not too many threads running.
.
.
#include <pthread.h>
#include <unistd.h>
// cc -O3 -lpthread
int max_threads;
int n_threads;
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
void qusort(int* left, int* right);
/* Thread entry point. arg is a malloc'ed int*[2] holding the
   {left, right} bounds to sort; it is freed here. When the last
   running thread finishes, it signals the condition variable so
   the waiting main thread wakes up. */
void* sort_thr(void *arg) {
    int** bounds = (int**)arg;
    int* lo = bounds[0];
    int* hi = bounds[1];
    qusort(lo, hi);
    free(arg);
    pthread_mutex_lock(&mutex);
    n_threads -= 1;
    if (n_threads == 0) pthread_cond_signal(&cond);
    pthread_mutex_unlock(&mutex);
    return NULL;
}
/* Sorts [left, right], handing the longer side of a partition to a
   new thread when it is big enough and the thread limit is not yet
   reached. Fixes the L381 TODO: malloc and pthread_create are now
   checked; on failure the range is sorted in the current thread and
   the thread counter is rolled back (otherwise the final wait in
   sort() would never be signalled). */
void qusort(int* left, int* right) {
    while (right - left >= 50) {
        int *l, *r;
        /* (l, r) gets the roughly smaller side, (left, right) the other */
        partition(left, right, &l, &r, &left, &right);
        int spawned = 0;
        /* n_threads is read without the lock: max_threads is only a
           soft limit, an occasional extra thread is harmless */
        if (right - left > 100000 && n_threads < max_threads) {
            int** param = malloc(2 * sizeof(int*));
            if (param != NULL) {
                param[0] = left;
                param[1] = right;
                pthread_mutex_lock(&mutex);
                n_threads += 1;
                pthread_mutex_unlock(&mutex);
                pthread_t thread;
                if (pthread_create(&thread, NULL, sort_thr, param) == 0) {
                    /* the new thread sorts (left, right); continue
                       with the smaller side in this thread */
                    spawned = 1;
                    left = l;
                    right = r;
                }
                else {
                    /* creation failed: roll back and sort locally */
                    free(param);
                    pthread_mutex_lock(&mutex);
                    n_threads -= 1;
                    pthread_mutex_unlock(&mutex);
                }
            }
        }
        if (!spawned) {
            qusort(l, r);
        }
    }
    insert_sort(left, right);
}
void sort(int* data, int len) {
int n_cpus = sysconf(_SC_NPROCESSORS_ONLN);
if (n_cpus > 0) max_threads = n_cpus * 2;
else max_threads = 8;
pthread_t thread;
int** param = malloc(2 * sizeof(int*));
param[0] = data;
param[1] = data + len - 1;
n_threads = 1;
pthread_create(&thread, NULL, sort_thr, param);
pthread_mutex_lock(&mutex);
pthread_cond_wait(&cond, &mutex);
pthread_mutex_unlock(&mutex);
}
// todo: check if 'malloc' and 'pthread_create' are successful
.
.
Sorting 50 million numbers with threaded BlockQuicksort ...
2.30s
Multithreading can significantly improve the application performance of programs.
The C Quicksort version is faster because C is generally faster than Go. Unlike C, Go checks array bounds at runtime, which takes time. In addition, the C version includes more optimizations.
Go is a language that makes threads comparatively easy to use. Go is a well-designed language, and by checking array bounds it is also memory-safe and yet quite fast.
easylang.online - easy programming online