One of the simple topics that helps to think in higher terms rather than a “for” loop but people hide it behind the big name of “functional paradigm” or “functional programming”.

Map

$T []$ and a function $f : T \to U$ to $U []$ .

You can do in-place, which is apparently easier with C.

#include <stddef.h>
 
typedef int (*mapper_func)(int);
 
int twice(int x) {
    return x * 2;
}
 
void map_in_place(int *arr, size_t size, mapper_func f) {
    for (size_t i = 0; i < size; ++i) {
        arr[i] = f(arr[i]);
    }
}
 
int main() {
    int arr[] = {1, 2, 3, 4, 5};
    size_t size = sizeof(arr) / sizeof(arr[0]);
 
    map_in_place(arr, size, twice);
};

Or return a new array.

#include <stddef.h>
#include <stdlib.h>
 
typedef int (*mapper_func)(int);
 
int twice(int x) {
    return x * 2;
}
 
int *map(int *arr, size_t size, mapper_func f) {
    int *result = malloc(size * sizeof(int));
    if (!result) {
        return NULL;  // alloc failure
    }
    for (size_t i = 0; i < size; ++i) {
        result[i] = f(arr[i]);
    }
    return result;
}
 
int main() {
    int arr[] = {1, 2, 3, 4, 5};
    size_t size = sizeof(arr) / sizeof(arr[0]);
 
    int *mapped = map(arr, size, twice);
    // use
    free(mapped);  // free the allocated memory
 
    return 0;
};

Filter

$T []$ and a predicate $p : T \to b oo l$ to $T []$ .

Again, can do in-place.

#include <stddef.h>
 
typedef int (*predicate)(int);
 
int odd(int x) {
    return x & 1;
}
 
int filter_in_place(int *arr, size_t size, predicate f) {
    size_t place_at = 0;
    for (size_t i = 0; i < size; ++i) {
        if (f(arr[i])) {
            arr[place_at++] = arr[i];
        }
    }
    return place_at;                       // new size
}
 
int main() {
    int arr[] = {1, 2, 3, 4, 5};
    size_t size = sizeof(arr) / sizeof(arr[0]);
 
    int new_size = filter_in_place(arr, size, odd);
 
    if(new_size) {
        arr = realloc(arr, new_size * sizeof(int));  // resize the array
    } else {
        free(arr);  // free the original array if no elements remain
        arr = NULL; // set to NULL to avoid dangling pointer
    }
 
    for (size_t i = 0; i < new_size; ++i) {
        printf("%d ", arr[i]);
    }
    printf("\n");
};

Note that this just returns the new size and does not free up any memory used by the unfiltered elements. To free, you need to do a realloc which returns the new pointer to array.

The other way is to return new array, and this is where things get more interesting. The problem being, you don’t know the size of the new array prior. So you may do one of:

Allocate a new array of the same size and then shrink it ( my preferred way )
Do a two-pass ( exact alloc but predicate applied twice so depends on constant time of the predicate )
Do an exponential growth ( like a vector in C++ ) ( could be better if array is large and most of the elements are filtered out )

Note that you need to return both the new array and the size so for this I’ll pass the new size to be used as a pointer.

#include <stddef.h>
#include <stdlib.h>
 
typedef int (*predicate)(int);
 
int odd(int x) {
    return x & 1;
}
 
int* filter(int* arr, size_t size, predicate f, int* new_size) {
    int* new_arr = malloc(size * sizeof(int));
    if (!new_arr) {
        return NULL;  // Memory allocation failed
    }
 
    size_t place_at = 0;
    for (size_t i = 0; i < size; ++i) {
        if (f(arr[i])) {
            new_arr[place_at++] = arr[i];
        }
    }
 
    *new_size = place_at;  // Set the new size
    if (new_size == 0) {
        free(new_arr);  // Free memory if no elements were added
        return NULL;
    } else {
        new_arr = realloc(new_arr, place_at * sizeof(int));  // Resize the array to the new size
    }
 
    return new_arr;
}
 
int main() {
    int arr[] = {1, 2, 3, 4, 5};
    size_t size = sizeof(arr) / sizeof(arr[0]);
 
    int new_size;
    int* new_arr = filter(arr, size, odd, &new_size);
};

Take a guess what sizeof(arr) and what sizeof(new_arr) will be. First is 20 bytes while second is 8. This is due to pointer decay.

How can I make sizeof work again? You can’t. There’s no way to reverse pointer decay. You need to use size and the pointer to the array.

On that note, what will these do?

#include <stdio.h>
 
void filter(int arr[5]) { // can also be just int arr[]
    printf("%d\n", sizeof(arr));
}
 
int main() {
    int arr[5] = {1, 2, 3, 4, 5};
    printf("%d\n", sizeof(arr));
    filter(arr);
}

Decays to pointer in function. Warning on compile about it.

#include <stdio.h>
 
void filter(int arr[120]) {
    printf("%d\n", sizeof(arr));
}
 
int main() {
    int arr[5] = {1, 2, 3, 4, 5};
    printf("%d\n", sizeof(arr));
    filter(arr);
}

Same as previous but with additional warning about size. Array sizes are more of a hint for documentation and static analysis. The compiler simply ignores the 120 here.

A note on realloc would be worth reading here.

Fold

An accumulator $A$ , an array $T []$ , a function $f : A, T \to A$ to $A$ . A generalization of reduce operation where the accumulator is simply the first element and f is $f : T, T \to T$ .

Quite simple to implement.

typedef double (*accumulator)(double, int);
 
double fold(int arr[], int size, accumulator acc, double init) {
    double result = init;
    for (int i = 0; i < size; i++) {
        result = acc(result, arr[i]);
    }
    return result;
}
 
double sum(double acc, int x) {
    return acc + x;
}
 
int main() {
    int arr[5] = {1, 2, 3, 4, 5};
    double result = fold(arr, 5, sum, 0.0);
}

Fold can also be though of a map from $T []$ to $A []$ and then a reduce from $A []$ to $A$ . Note that depending on the operation’s associativity, the result may vary.

Concat

$T []$ and $T []$ to $T []$ . How do I do this in the most efficient way?

Case: New array Can I do better than copying each element? Yes, I can copy memory itself instead of each element.

#include <stdlib.h>
#include <string.h>
 
int* concat(int* arr1, int size1, int* arr2, int size2) {
    int* result = malloc((size1 + size2) * sizeof(int));
    if (!result) {
        return NULL;  // alloc fail
    }
 
    memcpy(result, arr1, size1 * sizeof(int));
    memcpy(result + size1, arr2, size2 * sizeof(int));
 
    return result;
}
 
int main() {
    int arr1[] = {1, 2, 3, 4, 5};
    int arr2[] = {6, 7, 8, 9, 10};
 
    int* result = concat(arr1, sizeof(arr1) / sizeof(arr1[0]), arr2, sizeof(arr2) / sizeof(arr2[0]));
}

memcpy is part of string.h and copies memory from one location to another. Inputs are pointers and bytes. Note that arr1 and arr2 are on the stack and result is on the heap. Result could also have been on stack if you know the size. How does memcpy work then? Well even if it’s on stack, it’s still in memory so for memcpy nothing has changed.

memcpy here returns the pointer to the destination ( the first arg ). Note that this is a common pattern that allows for chaining.

Worth reading What is where in memory.

Case: In place If the array is on stack you can’t do anything. Stack arrays are either fixed at compile time or ( from C99 ) variable length arrays (VLA) that are fixed at runtime declaration. So I consider only heap arrays.

Often it’s good practice to cast the void* returned by malloc and realloc to the int* type but the cast is done implicitly in C so the intention is purely for readability.

#include <stdlib.h>
#include <string.h>
 
void iota(int* arr, int size, int start) {  // from C++
    for (int i = 0; i < size; i++) {
        arr[i] = start + i;
    }
}
 
int main() {
    int* arr1 = (int*)malloc(5 * sizeof(int));
    int* arr2 = (int*)malloc(5 * sizeof(int));
    int size1 = 5, size2 = 5;
 
    if (!arr1 || !arr2) {
        return 1;  // 1 is error code for failure, any non zero does
    }
    iota(arr1, 5, 0);
    iota(arr2, 5, 5);
 
    // concat in place to arr1
    int* new_loc = realloc(arr1, (size1 + size2) * sizeof(int));
    if (!realloc) {
        return 1;  // realloc failed
    }
 
    // note that new_loc can be different but realloc copies the arr1 to the new location, the extended part is uninitialized
    arr1 = new_loc;
    memcpy(arr1 + size1, arr2, size2 * sizeof(int));  // copy arr2 to the end of arr1
}

A quick gotcha on void pointers and memory increments. Guess the output:

#include <stdio.h>
#include <stdlib.h>
 
int main() {
    int* ptr = malloc(10);
    printf("%d\n", ((ptr + 1) - ptr));
    printf("%d\n", ((void*)(ptr + 1) - (void*)ptr));
}

The first one is 1, the second one is 4. This is because the pointer arithmetic takes into account the size of the type. The second one does not.

Reverse

$T []$ to $T []$ . Simple, in place. Note the static inline.

static inline void swap(int* a, int* b) {
    // this is simple & fastest swap, you can xor or sum swap ( or any binary operation with inverse )for "cool" points but
    // they are slower than this one
    int temp = *a;
    *a = *b;
    *b = temp;
}
 
void reverse(int* array, int size) {
    for (int i = 0; i < size / 2; i++) {
        swap(&array[i], &array[size - i - 1]);
    }
}

Shift

This is still $O (n)$ but you can greatly optimize the constant factor.

#include <stdio.h>
#include <string.h>
#include <time.h>
 
#define SIZE 10000000
int array[SIZE];
 
void iota(int *array, int size, int start) {
    for (int i = 0; i < size; i++) {
        array[i] = start + i;
    }
}
 
// Place the inline swap function here
static inline void swap(int *a, int *b) {
    int temp = *a;
    *a = *b;
    *b = temp;
}
 
void shift_loop_swap(int *array, int size) {
    for (int i = size - 1; i > 0; i--) {
        swap(&array[i], &array[i - 1]);
    }
}
 
void shift_loop(int *array, int size) {
    int temp = array[size - 1];
    for (int i = size - 1; i > 0; i--) {
        array[i] = array[i - 1];
    }
    array[0] = temp;
}
 
void shift_memmove(int *array, int size) {
    int temp = array[size - 1];
    memmove(array + 1, array, (size - 1) * sizeof(int));
    array[0] = temp;
}
 
void profile(int *array, int size, void (*shift_func)(int *, int)) {
    double total_time = 0.0;
    int runs = 10;
    for (int r = 0; r < runs; r++) {
        clock_t start = clock();
        shift_func(array, size);
        clock_t end = clock();
        double elapsed = (double)(end - start) / CLOCKS_PER_SEC;
        total_time += elapsed;
    }
    printf("Average: %f ms\n", (total_time / runs) * 1000);
}
 
int main() {
    iota(array, SIZE, 0);
 
    printf("Shift memmove:\n");
    profile(array, SIZE, shift_memmove);
 
    printf("Shift loop:\n");
    profile(array, SIZE, shift_loop);
 
    printf("Shift loop swap:\n");
    profile(array, SIZE, shift_loop_swap);
 
    return 0;
}

For size 1e7 arrays and 109 runs

Shift memmove:
Average: 0.433680 ms
Shift loop:
Average: 10.514980 ms
Shift loop swap:
Average: 20.538020 ms

Swapping is slower since a lot more assignment operations and creation of temporaries.

What if the array is small though? The difference is negligible then though the pattern holds. Why? I can only guess it’s due to better memory locality and hence cache locality. For size 100 and 1000000 runs

Shift memmove:
Average: 0.000311 ms
Shift loop:
Average: 0.000401 ms
Shift loop swap:
Average: 0.000510 ms

If you wanted to do a shift by $k$ you can use a memcpy to a buffer then a memmove followed by a memcpy again. There too you can determine the direction of smaller shift. This is a TODO for me.

Another problem now is that we are using a buffer. What if I want to do a shift by $k$ in place?

3 Reversals

Let the last k elements be “b” and rest “a”. Initially, I have $a ∣ b$ and I want $b ∣ a$ . Note that $re v (a ∣ b) = re v (b) ∣ re v (a)$ . Now if I reverse the individual parts, I get $b ∣ a$ .

Minimum swaps

The idea can be derived from “greedily” as well, as in say I start from the first element and do a cyclic swap of every kth element. In order to mark the position, I multiply each by -1, I stop when I get a negative number.

Though this isn’t the “beatiful” way to get to the solution. The generalised idea comes from permutation cycles and there’s a way to convert a one permuation to another in the minimum number of swaps.

Say I want avoid the last pass of greedy solution that requires me to flip each element. Then I need to know the number of cycles. Formulating this, Consider I’m starting at position $p$ , and after $n$ $k$ shifts, I get to $p$ again. Then, $(p + nk) mod l = p$ . Or,

$p + nk = ql + p$
$nk = ql$
$n = \frac{ql}{k}$

Now the “cycle length” would be where I have n as minimum. Think, what are the minimum jumps after which I’m back to the start. I can only change $q$ , but first let’s convert it into irrudicible form by dividing by $g c d (l, k)$ to get $l^{'}$ and $k^{'}$ .

Now, q must be minimum so that this whole thing is an integer, since we cannot reduce using $l^{'}$ and $k^{'}$ , so $q$ must take out entirey of denominator. Hence, $q = k^{'}$ , and $n = \frac{l}{g c d ( l , k )}$ .

Note that the cycle length is independent of the starting position, so all cycles have the same length ( can also be reasoned from a symmetry argument ). Number of cycles then is $l / n = g c d (l, k)$ .

I’ve made the mistake of trying to relate this to diophantine but let’s just say not every gcd takes you to diophantine. In this case, a solution always exists, it’s not even a diophantine.

Benchmarking,

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
 
#define SIZE 1000000
int array[SIZE];
 
void iota(int* array, int size, int start) {
    for (int i = 0; i < size; i++) {
        array[i] = start + i;
    }
}
 
static inline int gcd(int a, int b) {
    while (b != 0) {
        int t = b;
        b = a % b;
        a = t;
    }
    return a;
}
 
static inline void swap(int* a, int* b) {
    int temp = *a;
    *a = *b;
    *b = temp;
}
 
void profile(int* array, int size, void (*shift_func)(int*, int, int)) {
    double total_time = 0.0;
    int runs = 100;
    for (int r = 0; r < runs; r++) {
        clock_t start = clock();
        shift_func(array, size, SIZE / 2);
        clock_t end = clock();
        double elapsed = (double)(end - start) / CLOCKS_PER_SEC;
        total_time += elapsed;
    }
    printf("Average: %f ms\n", (total_time / runs) * 1000);
}
 
void shift_k_memmove(int* array, int size, int shift) {
    shift = shift % size;
 
    int* buffer = (int*)malloc(shift * sizeof(int));
    if (buffer == NULL) {
        return;
    }
 
    // last k to buffer
    memcpy(buffer, array + size - shift, shift * sizeof(int));
    // shift rest
    memmove(array + shift, array, (size - shift) * sizeof(int));
    // copy buffer to front
    memcpy(array, buffer, shift * sizeof(int));
    free(buffer);
}
 
void shift_k_no_mem_assigns(int* array, int size, int shift) {
    int cycles = gcd(size, shift);
    int cycle_size = size / cycles;
    shift = shift % size;
 
    for (int i = 0; i < cycles; i++) {
        // cyclic shift next for the kth elements
        int j = size - 1 - i;
        int temp = array[j];
        int next = j - shift;
 
        for (int cyc = 0; cyc < cycle_size - 1; cyc++, j -= shift, next -= shift) {
            if (j < 0) j += size;
            if (next < 0) next += size;
 
            array[j] = array[next];
        }
 
        if (j < 0) j += size;
        array[j] = temp;
    }
}
 
void shift_k_no_mem_swaps(int* array, int size, int shift) {
    int cycles = gcd(size, shift);
    int cycle_size = size / cycles;
    shift = shift % size;
 
    for (int i = 0; i < cycles; i++) {
        // cyclic shift next for the kth elements
        int j = size - 1 - i;
        int next = j - shift;
 
        for (int cyc = 0; cyc < cycle_size - 1; cyc++, j -= shift, next -= shift) {
            if (j < 0) j += size;
            if (next < 0) next += size;
 
            swap(&array[next], &array[j]);
        }
    }
}
 
int main() {
    iota(array, SIZE, 0);
 
    printf("Shift k with memmove:\n");
    profile(array, SIZE, shift_k_memmove);
 
    printf("Shift k with no memory assigns:\n");
    profile(array, SIZE, shift_k_no_mem_assigns);
 
    printf("Shift k with no memory swaps:\n");
    profile(array, SIZE, shift_k_no_mem_swaps);
 
    return 0;
}

1e7, 100 runs, shift=half

Shift k with memmove:
Average: 1.168920 ms
Shift k with no memory assigns:
Average: 15.665170 ms
Shift k with no memory swaps:
Average: 20.670650 ms

1e7, 100 runs, shift=1

Shift k with memmove:
Average: 0.445730 ms
Shift k with no memory assigns:
Average: 14.632220 ms
Shift k with no memory swaps:
Average: 30.658750 ms

Memmove can be explained, since in shift 1, you just copy 1 element to buffer and majority is memmove.

The assignment one is similar but what about the swaps? It’s because the iterations of inner loop are effectively halved. For shift 1, it’s $n - 1$ , for shift half, it’s $n /2$ . Do not ignore the -1 in the inner loop, that makes all the difference since it is scaled by the outer loop.

Shuffle

$T []$ to $T []$ .

The Fisher-Yates is incredibly simple and efficient. For each element, you swap with the still unshuffled part ( including itself ).

void shuffle(int* array, int size) {
    for (int i = size - 1; i > 0; i--) {
        int j = rand() % (i + 1);
        swap(&array[i], &array[j]);
    }
}

Now, while this feels correct, if someone were to write a shuffle, they would probably do: for each element, swap it at a random position. And this is wrong.

Say a path as a sequence of swap operations. The number of paths is $n^{n}$ while the number of permutations is $n!$ . Clearly, some paths lead to same permuation, which should be fine as long as it holds for each permutation, in which case $n^{n}$ must be divisible by $n!$ which doesn’t hold as well.

Now even though fisher is $n!$ paths, that alone is not enough for a proof as the unique mapping is not established yet. Though that’s simple as well, the first element can be any of the $n$ elements, the second any of the $n - 1$ and so on.

Sort

This section got so long that I moved it.

Unique

$T []$ to $T []$ with unique elements.

In-place, the ideal way you would want to do this is by a sort + remove duplicates.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
 
#define SIZE 100
int array[SIZE];
 
static int compare(const void* a, const void* b) {
    return (*(int*)a - *(int*)b);
}
 
int unique(int* array, int start, int end) {
    qsort(array + start, end - start + 1, sizeof(int), compare);
    int at = 0;  // pos of last unique element
    for (int i = start + 1; i <= end; i++) {
        if (array[i] != array[at]) {
            at++;
            array[at] = array[i];
        }
    }
    return at + 1;
}
 
int main() {
    srand(time(NULL));
 
    for (int i = 0; i < SIZE; i++) {
        array[i] = rand() % 10;
    }
 
    int new_size = unique(array, 0, SIZE - 1);
    for (int i = 0; i < new_size; i++) {
        printf("%d ", array[i]);
    }
 
    return 0;
}

If you could use more space for better performance, a running hash set is another easy way.

Search

I don’t want to spend time writing about it, it’s broadly covered and quite simple for me. The advanced stuff does not belong here anyway.

Splice

$T []$ and $T []$ . The operation I refer to the one with the same name in javascript. The api that I’ll implement is slightly different ( with insert_count since C varargs do not support length determination ): splice(array, start, delete_count, insert_count, ...items). While I personally think array for the items to insert is better but this impl allows me to use variadic arguments so I’ll use it.

Operations is: At start, delete delete_count elements and insert the rest of the insert_count items.

The idea is:

Deleting more?
- Perform the insertions as an overwrite.
- memmove over the gap created by excess deletion
- realloc to net decreased size
Inserting more?
- realloc to the net increased size
- memmove to the right end
- Perform insertions as a write now, since insertions are more all deletions are done via overwrite.

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
 
int *splice(int *array, int *size_ref, int start, int delete_count, int insert_count, ...) {
    int size = *size_ref;
 
    va_list args;
    va_start(args, insert_count);
 
    int delta_size = insert_count - delete_count;
    int new_size = size + delta_size;
 
    if (new_size == 0) {
        free(array);
        array = NULL;
        return array;
    }
 
    int to_insert[insert_count];
    for (int i = 0; i < insert_count; i++) {
        to_insert[i] = va_arg(args, int);
    }
 
    if (delta_size == 0) {
        memcpy(array + start, to_insert, insert_count * sizeof(int));
    } else if (delta_size > 0) {
        array = realloc(array, new_size * sizeof(int));  // not handling NULL case
 
        int to_move = size - (start + delete_count);
        // Shift elements: from array + start to array + insert + delta_size
        memmove(array + start + insert_count, array + start + delete_count, to_move * sizeof(int));
        // Insert new elements
        memcpy(array + start, to_insert, insert_count * sizeof(int));
    } else {
        // write the new elements
        memcpy(array + start, to_insert, insert_count * sizeof(int));
        // shift over the gap from excess deletion
        int to_move = size - (start + delete_count);
        memmove(array + start + insert_count, array + start + delete_count, to_move * sizeof(int));
        // Resize the array
        array = realloc(array, new_size * sizeof(int));  // not handling NULL case
    }
 
    va_end(args);
 
    *size_ref = new_size;
    return array;
}
 
void print_array(int *array, int size) {
    printf("size is %d\n", size);
    for (int i = 0; i < size; i++) {
        printf("%d ", array[i]);
    }
    printf("\n");
}
 
int main() {
    int size = 10;
    int *array = malloc(size * sizeof(int));
    for (int i = 0; i < size; i++)
        array[i] = i;
 
    print_array(array, size);
    array = splice(array, &size, 4, 6, 3, 99, 99, 99);
    print_array(array, size);
 
    return 0;
}

Lest I Forget

Notes

Array Transforms

Map