Multithreaded Branch-Avoidant Quicksort

17 min read Original article ↗

A Fast Quicksort in C for Modern CPUs with Threads and Branch‑Avoidant Coding

Multithreading has become increasingly important with stagnating single‑thread performance and widespread multicore CPUs. The challenge is to divide the work across multiple threads in a way that actually improves performance.

This is not possible for every algorithm. However, Quicksort is a classic divide‑and‑conquer algorithm, which makes it well suited for parallelization, since independent subproblems can be processed concurrently.

On modern CPUs, avoiding branch misprediction is another way to speed up programs. One effective way to reduce branch misprediction is to eliminate branches altogether.

for (int i = 0, j = 0; i < 1000; i++) {
    if (numbers[i] < 500) {
        small_numbers[j] = numbers[i];
        j += 1;
    }
}
If the array contains random numbers, the branch outcome becomes difficult for the predictor to learn, and the misprediction rate is typically high, often approaching 50%.

This can be avoided by eliminating the branching.

for (int i = 0, j = 0; i < 1000; i++) {
    small_numbers[j] = numbers[i];
    j += (numbers[i] < 500);
}
While this version introduces an unconditional store, its cost is typically far lower than that of a branch misprediction.

Quicksort in C - Performance Comparison

Performance results naturally depend on the underlying hardware. The following benchmarks show the execution times for sorting 50 million integers using different sorting implementations. The measurements were taken on an Apple M1 system and on an Intel Xeon (E5-2680) system with the -O3 compiler option.

Implementation Apple M1 Intel Xeon
Baseline Quicksort 3.191s 4.953s
std::sort (C++) 1.190s 4.949s
Branch-Avoidant Single-Threaded 0.923s 1.814s
Branch-Avoidant Multi-Threaded 0.243s 0.461s

Baseline Quicksort

This is a simple iterative quicksort written in C using a Lomuto‑style partition.

The pivot is placed directly into its final position during partitioning, and the algorithm always continues with the larger part while storing the smaller one on a manual stack.

To avoid the O(n²) runtime caused by many duplicates, the program switches to heap sort for a segment if it detects a significant bias in the partitioning.

Small ranges are finished using a sorting network.

// SPDX-License-Identifier: MIT
// (c) 2026 christof.kaser@gmail.com
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>

#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int

#define NAME "Baseline Quicksort"

#define swap(a, b) do { __typeof__(a) _h = a; a = b; b = _h; } while (0)

TYPE* partition(TYPE* left, TYPE* right);
void sorting_network(TYPE* left, int size);
void heap_sort(TYPE* left, TYPE* right);

void sort(TYPE* data, int len) {

    TYPE* left = data;
    TYPE* stack[64];    // for max 2^32 items
    unsigned sp = 0;
    TYPE* right = data + len - 1;

    while (1) {
        long partsz = right - left;
        if (partsz <= 4) {
            sorting_network(left, partsz);
            goto pop_stack;
        }
        TYPE* mid = partition(left, right);

        if ((mid - left) * 16 < partsz) {
            heap_sort(left, right);
            goto pop_stack;
        }
        if (mid < left + partsz / 2) {
            stack[sp] = mid + 1;
            stack[sp + 1] = right;
            right = mid - 1;
        }
        else {
            stack[sp] = left;
            stack[sp + 1] = mid - 1;
            left = mid + 1;
        }
        sp += 2;
        continue;

    pop_stack:
        if (sp == 0) break;
        sp -= 2;
        left = stack[sp];
        right = stack[sp + 1];
    }
}

// ---------------------- heapsort and insertsort ----------------------------

void heap_sort(TYPE* left, TYPE* right) {

    long n = right - left + 1;
    if (n < 2) return;
    for (long i = n / 2, j, k; ; ) {
        if (i > 0) {
            k = left[--i];
        }
        else {
            n -= 1;
            if (n == 0) return;
            k = left[n];
            left[n] = left[0];
        }
        j = i;
        while (j * 2 + 1 < n) {
            long child = j * 2 + 1;
            if (child + 1 < n && IS_LOWER(left[child], left[child + 1])) child++;
            if (!IS_LOWER(k, left[child])) break;
            left[j] = left[child];
            j = child;
        }
        left[j] = k;
    }
}

#define sort2(a, b) do { \
    if (IS_LOWER(b, a)) swap(a, b); \
} while(0)              \

#define sort3(a, b, c) do { \
    sort2(a, b); sort2(b, c); sort2(a, b); \
} while(0)

#define sort4(a, b, c, d) do { \
    sort2(a, b); sort2(c, d); sort2(a, c); \
    sort2(b, d); sort2(b, c); \
} while(0)

#define sort5(a, b, c, d, e) do { \
    sort2(b, c); sort2(d, e); sort2(b, d); \
    sort2(a, c); sort2(a, d); sort2(c, e); \
    sort2(a, b); sort2(c, d); sort2(b, c); \
} while(0)

void sorting_network(TYPE* left, int partsz_min_1) {
    switch (partsz_min_1) {
    case 4:
        sort5(left[0], left[1], left[2], left[3], left[4]);
        break;
    case 3:
        sort4(left[0], left[1], left[2], left[3]);
        break;
    case 2:
        sort3(left[0], left[1], left[2]);
        break;
    case 1:
        sort2(left[0], left[1]);
        break;
    case 0:
        break;
    }
}

// ---------------------- partition ----------------------------

TYPE* partition(TYPE* left, TYPE* right)
{
    TYPE* outerleft = left;
    TYPE* pivp = left + (right - left) / 2;

    TYPE piv = *pivp;
    *pivp = *outerleft;
    TYPE* mid = left;
    left += 1;

    while (left <= right) {
        if (IS_LOWER(*left, piv)) {
            mid++;
            swap(*mid, *left);
        }
        left++;
    }
    *outerleft = *mid;
    *mid = piv;
    return mid;
}

// ---------------------- testing and main -----------------------------------

unsigned chksum;
unsigned hash_el(void *p, int sz) {
    unsigned char* c = (unsigned char*)p;
    unsigned h = 0;
    for (int i = 0; i < sz; i++) h = (h * 131) + c[i];
    return h;
}
void init(TYPE* data, int len) {
    chksum = 0;
    for (int i = 0; i < len; i++) {
        data[i] = rand();
        chksum += hash_el(&data[i], sizeof(TYPE));
    }
}
void test(TYPE* data, int len) {
    unsigned chks = hash_el(&data[0], sizeof(TYPE));;
    for (int i = 1; i < len; i++) {
        if (data[i] < data[i - 1]) {
            printf("ERROR ORDER\n");
            break;
        }
        chks += hash_el(&data[i], sizeof(TYPE));
    }
    if (chks != chksum) printf("ERROR CHKS\n");
}
double ts(void) {
    static double t0;
    struct timeval tv;
    gettimeofday(&tv, NULL);
    double h = t0;
    t0 = tv.tv_sec + tv.tv_usec / 1000000.0;
    return t0 - h;
}

#define SIZE (50 * 1000000)
TYPE data[SIZE];

int main(void) {
    init(data, SIZE);
    printf("Sorting %d million numbers " NAME " ...\n", SIZE / 1000000);
    ts();
    sort(data, SIZE);
    printf("%.3fs\n", ts());
    test(data, SIZE);
    return 0;
}
Sorting 50 million numbers Baseline Quicksort ...
3.186s

C++ std::sort

For comparison C++ std::sort:

#include <algorithm>
#include <sys/time.h>

#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int

#define NAME "C++ std::sort"

void sort(TYPE* data, int len) {
    std::sort(data, data + len);
}

// ---------------------- testing and main -----------------------------------
.
.
Sorting 50 million numbers with std::sort ...
1.190s

Branch-Avoidant Single-Threaded

The paper https://arxiv.org/abs/1604.06697 by Edelkamp and A. Weiß shows how partitioning performance in Quicksort can be improved by avoiding conditional branches.

The idea of using auxiliary memory for branchless partitioning is inspired by https://github.com/scandum/fluxsort .

The sorting network was expanded and built on a branchless structure.

// SPDX-License-Identifier: MIT
// (c) 2026 christof.kaser@gmail.com
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>

#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int

#define NAME "Branchless Quicksort"

#define swap(a, b) do { __typeof__(a) _h = a; a = b; b = _h; } while (0)

TYPE* partition(TYPE* left, TYPE* right);
void sorting_network(TYPE* left, int size);
void heap_sort(TYPE* left, TYPE* right);
TYPE* partition_small(TYPE* left, TYPE* right);

#define SMALLPART 2048

void sort(TYPE* data, int len) {

    TYPE* left = data;
    TYPE* stack[64];    // for max 2^32 items
    unsigned sp = 0;
    TYPE* right = data + len - 1;

    while (1) {
        long partsz = right - left;
        if (partsz <= 15) {
            sorting_network(left, partsz);
            goto pop_stack;
        }
        TYPE* mid;
        if (partsz <= SMALLPART) mid = partition_small(left, right);
        else mid = partition(left, right);

        if (partsz > SMALLPART && (mid - left) * 16 < partsz) {
            heap_sort(left, right);
            goto pop_stack;
        }
        if (mid < left + partsz / 2) {
            stack[sp] = mid + 1;
            stack[sp + 1] = right;
            right = mid - 1;
        }
        else {
            stack[sp] = left;
            stack[sp + 1] = mid - 1;
            left = mid + 1;
        }
        sp += 2;
        continue;

    pop_stack:
        if (sp == 0) break;
        sp -= 2;
        left = stack[sp];
        right = stack[sp + 1];
    }
}

// ---------------------- heapsort and sorting network ----------------------------

void heap_sort(TYPE* restrict left, TYPE* restrict right) {

    long n = right - left + 1;
    if (n < 2) return;
    for (long i = n / 2, j, k; ; ) {
        if (i > 0) {
            k = left[--i];
        }
        else {
            n -= 1;
            if (n == 0) return;
            k = left[n];
            left[n] = left[0];
        }
        j = i;
        while (j * 2 + 1 < n) {
            long child = j * 2 + 1;
            if (child + 1 < n && IS_LOWER(left[child], left[child + 1])) child++;
            if (!IS_LOWER(k, left[child])) break;
            left[j] = left[child];
            j = child;
        }
        left[j] = k;
    }
}

#define sort2(a, b) do {  \
    __typeof__(a) x = a; \
    __typeof__(a) y = b; \
    int m = IS_LOWER(x, y); \
    a = m ? x : y;  \
    b = m ? y : x;  \
} while(0)

#define sort3(a, b, c) do { \
    sort2(a, b); sort2(b, c); sort2(a, b); \
} while(0)

#define sort4(a, b, c, d) do { \
    sort2(a, b); sort2(c, d); sort2(a, c); \
    sort2(b, d); sort2(b, c); \
} while(0)

#define sort5(a, b, c, d, e) do { \
    sort2(b, c); sort2(d, e); sort2(b, d); \
    sort2(a, c); sort2(a, d); sort2(c, e); \
    sort2(a, b); sort2(c, d); sort2(b, c); \
} while(0)

#define sort6(a, b, c, d, e, f) do { \
    sort2(a, b); sort2(c, d); sort2(e, f); \
    sort2(a, c); sort2(b, d); sort2(e, f); \
    sort2(a, e); sort2(b, f); sort2(c, e); \
    sort2(d, f); sort2(b, c); sort2(d, e); \
    sort2(c, d); \
} while(0)

#define sort7(a, b, c, d, e, f, g) do { \
    sort2(a, b); sort2(c, d); sort2(a, c); \
    sort2(b, d); sort2(b, c); sort2(e, f); \
    sort2(e, g); sort2(f, g); sort2(a, e); \
    sort2(b, f); sort2(c, g); sort2(b, e); \
    sort2(d, g); sort2(c, e); sort2(b, c); \
    sort2(d, f); sort2(d, e); \
} while(0)

#define sort8(a, b, c, d, e, f, g, h) do { \
    sort7(a, b, c, d, e, f, g); \
    sort2(g, h); sort2(f, g); sort2(e, f); \
    sort2(d, e); sort2(c, d); sort2(b, c); sort2(a, b); \
} while(0)

#define sort9(a, b, c, d, e, f, g, h, i) do { \
    sort8(a, b, c, d, e, f, g, h); \
    sort2(h, i); sort2(g, h); sort2(f, g); sort2(e, f); \
    sort2(d, e); sort2(c, d); sort2(b, c); sort2(a, b); \
} while(0)

#define sort10(a, b, c, d, e, f, g, h, i, j) do { \
    sort9(a, b, c, d, e, f, g, h, i); \
    sort2(i, j); sort2(h, i); sort2(g, h); sort2(f, g); sort2(e, f); \
    sort2(d, e); sort2(c, d); sort2(b, c); sort2(a, b); \
} while(0)

#define sort11(a,b,c,d,e,f,g,h,i,j,k) do { \
    sort10(a,b,c,d,e,f,g,h,i,j); \
    sort2(j,k); sort2(i,j); sort2(h,i); sort2(g,h); sort2(f,g); \
    sort2(e,f); sort2(d,e); sort2(c,d); sort2(b,c); sort2(a,b); \
} while(0)

#define sort12(a,b,c,d,e,f,g,h,i,j,k,l) do { \
    sort11(a,b,c,d,e,f,g,h,i,j,k); \
    sort2(k,l); sort2(j,k); sort2(i,j); sort2(h,i); sort2(g,h); \
    sort2(f,g); sort2(e,f); sort2(d,e); sort2(c,d); sort2(b,c); sort2(a,b); \
} while(0)

#define sort13(a, b, c, d, e, f, g, h, i, j, k, l, m) do { \
    sort12(a, b, c, d, e, f, g, h, i, j, k, l); \
    sort2(l, m); sort2(k, l); sort2(j, k); sort2(i, j); sort2(h, i); \
    sort2(g, h); sort2(f, g); sort2(e, f); sort2(d, e); sort2(c, d); \
    sort2(b, c); sort2(a, b); \
} while(0)

#define sort14(a, b, c, d, e, f, g, h, i, j, k, l, m, n) do { \
    sort13(a, b, c, d, e, f, g, h, i, j, k, l, m); \
    sort2(m, n); sort2(l, m); sort2(k, l); sort2(j, k); sort2(i, j); \
    sort2(h, i); sort2(g, h); sort2(f, g); sort2(e, f); sort2(d, e); \
    sort2(c, d); sort2(b, c); sort2(a, b); \
} while(0)

#define sort15(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o) do { \
    sort14(a, b, c, d, e, f, g, h, i, j, k, l, m, n); \
    sort2(n, o); sort2(m, n); sort2(l, m); sort2(k, l); sort2(j, k); \
    sort2(i, j); sort2(h, i); sort2(g, h); sort2(f, g); sort2(e, f); \
    sort2(d, e); sort2(c, d); sort2(b, c); sort2(a, b); \
} while(0)

#define sort16(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) do { \
    sort15(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o); \
    sort2(o, p); sort2(n, o); sort2(m, n); sort2(l, m); sort2(k, l); \
    sort2(j, k); sort2(i, j); sort2(h, i); sort2(g, h); sort2(f, g); \
    sort2(e, f); sort2(d, e); sort2(c, d); sort2(b, c); sort2(a, b); \
} while(0)

void sorting_network(TYPE* left, int partsz_min_1) {
    switch (partsz_min_1) {
    case 15:
        sort16(left[0], left[1], left[2], left[3], left[4], left[5], left[6], left[7],
            left[8], left[9], left[10], left[11], left[12], left[13], left[14], left[15]);
        break;
    case 14:
        sort15(left[0], left[1], left[2], left[3], left[4], left[5], left[6], left[7],
            left[8], left[9], left[10], left[11], left[12], left[13], left[14]);
        break;
    case 13:
        sort14(left[0], left[1], left[2], left[3], left[4], left[5], left[6], left[7],
            left[8], left[9], left[10], left[11], left[12], left[13]);
        break;
    case 12:
        sort13(left[0], left[1], left[2], left[3], left[4], left[5], left[6], left[7],
            left[8], left[9], left[10], left[11], left[12]);
        break;
    case 11:
        sort12(left[0], left[1], left[2], left[3], left[4], left[5], left[6], left[7],
            left[8], left[9], left[10], left[11]);
        break;
    case 10:
        sort11(left[0], left[1], left[2], left[3], left[4], left[5], left[6], left[7],
            left[8], left[9], left[10]);
        break;
    case 9:
        sort10(left[0], left[1], left[2], left[3], left[4], left[5], left[6], left[7],
            left[8], left[9]);
        break;
    case 8:
        sort9(left[0], left[1], left[2], left[3], left[4], left[5], left[6], left[7], left[8]);
        break;
    case 7:
        sort8(left[0], left[1], left[2], left[3], left[4], left[5], left[6], left[7]);
        break;
    case 6:
        sort7(left[0], left[1], left[2], left[3], left[4], left[5], left[6]);
        break;
    case 5:
        sort6(left[0], left[1], left[2], left[3], left[4], left[5]);
        break;
    case 4:
        sort5(left[0], left[1], left[2], left[3], left[4]);
        break;
    case 3:
        sort4(left[0], left[1], left[2], left[3]);
        break;
    case 2:
        sort3(left[0], left[1], left[2]);
        break;
    case 1:
        sort2(left[0], left[1]);
        break;
    case 0:
        break;
    }
}
// ---------------------- partition ----------------------------

#define med5(a, b, c, d, e) do { \
    sort2(a, b); sort2(c, d); sort2(a, c); \
    sort2(b, d); sort2(b, c); sort2(c, e); \
    sort2(b, c); \
} while(0)

TYPE* partition_small(TYPE* restrict  left, TYPE* restrict  right) {

    TYPE* outerleft = left;
    TYPE* pivp = left + (right - left) / 2;

    med5(left[1], left[2], *pivp, right[-1], *right);
    left += 3;
    right -= 2;

    TYPE piv = *pivp;
    *pivp = *outerleft;

    TYPE _Alignas(64) swbuf[SMALLPART];
    TYPE* sw = swbuf;

    TYPE* left0 = left;
    while (left <= right) {
        int h = IS_LOWER(*left, piv); *left0 = *sw = *left++; left0 += h; sw += !h;
    }
    memcpy(left0, swbuf, (sw - swbuf) * sizeof(TYPE));
    left0 -= 1;
    *outerleft = *left0;
    *left0 = piv;
    return left0;
}

TYPE* partition(TYPE* restrict left, TYPE* restrict right) {

    TYPE* outerleft = left;
    int n = right - left;
    TYPE* pivp = left + n / 2;
    TYPE piv;

    TYPE* q1 = left + n / 4;
    TYPE* q3 = pivp + n / 4;

    left += 1;
    med5(left[0], left[1], left[2], left[3], left[4]);
    med5(q1[-2], q1[-1], q1[0], q1[1], q1[2]);
    med5(pivp[-2], pivp[-1], pivp[0], pivp[1], pivp[2]);
    med5(q3[-2], q3[-1], q3[0], q3[1], q3[2]);
    med5(right[-4], right[-3], right[-2], right[-1], right[0]);
    med5(left[2], q1[0], pivp[0], q3[0], right[-2]);

    piv = *pivp;
    *pivp = *outerleft;

    #define SWSZ 1024
    TYPE _Alignas(64) swbuf[SWSZ];

    TYPE* left0 = left;
    TYPE* right0 = right;

    TYPE* sw = swbuf;
    #define UNROLL 16
    while (sw < swbuf + SWSZ - UNROLL && left <= right - UNROLL) {
        for (int i = UNROLL; i--;) {
            int h = IS_LOWER(*right, piv); *right0 = *sw = *right--; right0 -= !h; sw += h;
        }
    }
    while (sw < swbuf + SWSZ - UNROLL && left <= right) {
        int h = IS_LOWER(*right, piv); *right0 = *sw = *right--; right0 -= !h; sw += h;
    }
    while (1) {
        while (right0 > right + UNROLL && left <= right - UNROLL) {
            for (int i = UNROLL; i--;) {
                int h = IS_LOWER(*left, piv); *left0 = *right0 = *left++; left0 += h; right0 -= !h;
            }
        }
        while (left0 < left - UNROLL && left <= right - UNROLL) {
            for (int i = UNROLL; i--;) {
                int h = IS_LOWER(*right, piv); *right0 = *left0 = *right--; right0 -= !h; left0 += h;
            }
        }
        if (left > right - UNROLL) break;
    }
    while (right0 > right && left <= right) {
        int h = IS_LOWER(*left, piv); *left0 = *right0 = *left++; left0 += h; right0 -= !h;
    }
    while (left <= right) {
        int h = IS_LOWER(*right, piv); *right0 = *left0 = *right--; right0 -= !h; left0 += h;
    }
    memcpy(left0, swbuf, (sw - swbuf) * sizeof(TYPE));

    *outerleft = *right0;
    *right0 = piv;
    return right0;
}

// ---------------------- testing and main -----------------------------------
.
.
.
Sorting 50 million numbers with Branchless Quicksort ...
0.923s

Branch-Avoidant Multi-Threaded

The algorithm is extended to use multiple CPU cores.

New threads are created only for large partitions and are limited so that we don’t start too many threads. Smaller partitions are processed in the current thread.

// SPDX-License-Identifier: MIT
// (c) 2026 christof.kaser@gmail.com
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>
#include <pthread.h>
#include <unistd.h>
#include <stdatomic.h>

#define IS_LOWER(a, b) ((a) < (b))
#define TYPE int

#define NAME "Threaded Branchless Quicksort"

#define swap(a, b) do { __typeof__(a) h = a; a = b; b = h; } while (0)

TYPE* partition(TYPE* left, TYPE* right);
void sorting_network(TYPE* left, int size);
void heap_sort(TYPE* left, TYPE* right);
TYPE* partition_small(TYPE* left, TYPE* right);

#define SMALLPART 2048

int max_threads;
atomic_int n_threads;

pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

void* sort_thr(void *arg) {

    TYPE** stack = (TYPE**)arg;
    int sp = 0;
    TYPE* left = stack[sp];
    TYPE* right = stack[sp + 1];

    while (1) {
        int partsz = right - left;
        if (partsz <= 15) {
            sorting_network(left, partsz);
            goto pop_stack;
        }
        TYPE* l;
        TYPE* r;

        TYPE* mid;
        if (partsz <= SMALLPART) mid = partition_small(left, right);
        else mid = partition(left, right);

        if (partsz > SMALLPART && mid - left < 100) {
            heap_sort(left, right);
            goto pop_stack;
        }

        if (mid < left + partsz / 2) {
            l = mid + 1;
            r = right;
            right = mid - 1;
        }
        else {
            l = left;
            r = mid - 1;
            left = mid + 1;
        }
        // soft limit: may exceed max_threads
        TYPE** stack_new = NULL;
        if (r - l > 1000000 && n_threads < max_threads) {
            // start a new thread - max_threads is a soft limit
            stack_new = malloc(64 * sizeof(TYPE*));
            // stack size 64 is sufficient for n <= 2^32
            if (stack_new) {
                stack_new[0] = l;
                stack_new[1] = r;

                pthread_mutex_lock(&mtx);
                pthread_t thread;
                if (pthread_create(&thread, NULL, sort_thr, stack_new) == 0) {
                    n_threads += 1;
                    pthread_detach(thread);
                }
                else {
                    free(stack_new);
                    stack_new = NULL;
                }
                pthread_mutex_unlock(&mtx);
            }
        }
        if (stack_new == NULL) {
            stack[sp] = l;
            stack[sp + 1] = r;
            sp += 2;
        }
        continue;

    pop_stack:
        if (sp == 0) break;
        sp -= 2;
        left = stack[sp];
        right = stack[sp + 1];
    }

    free(stack);
    pthread_mutex_lock(&mtx);
    n_threads -= 1;
    if (n_threads == 0) pthread_cond_signal(&cond);
    pthread_mutex_unlock(&mtx);

    return NULL;
}

void sort(TYPE* data, int len) {

    int n_cpus = sysconf(_SC_NPROCESSORS_ONLN);
    // printf("CPUs: %d\n", n_cpus);
    if (n_cpus > 0) max_threads = n_cpus * 2;
    else max_threads = 8;

    for (TYPE* p = data + 1; p < data + len; p++) {
        if (*p < *data) swap(*p, *data);
    }
    pthread_t thread;
    TYPE** stack = malloc(64 * sizeof(TYPE*));
    stack[0] = data + 1;
    stack[1] = data + len - 1;
    n_threads = 1;
    pthread_create(&thread, NULL, sort_thr, stack);

    pthread_mutex_lock(&mtx);
    while (n_threads != 0) pthread_cond_wait(&cond, &mtx);
    pthread_mutex_unlock(&mtx);
}

// ---------------------- heapsort and sorting network ----------------------------
.
.
Sorting 50 million numbers with Threaded Branchless Quicksort ...
0.243s
Multithreading and the avoidance of branch mispredictions can significantly improve the application performance of programs.

Interactive sorting demo


christof.kaser@gmail.com