0

I'm trying to figure out how much I should care about alignment. Here I'm testing some arithmetic using two different buffers. If I set this up right, in the 'wacky' buffer the integer is stored at the 29th byte arbitrarily. In the 'normal' buffer the integer is stored at the 29th 4-byte integer, like any sane array would. I am printing out the results of my tests. The wacky integers are slower, but it actually doesn't matter if I pick 29 or 0 or 1, the performance ratio is about the same. The ratio also doesn't change if compiler optimizations are turned on or off. Is this an accurate representation of the performance cost of doing this? I might be completely confused or missing something here but I'd appreciate if someone could point me in the right direction.

Here's the code:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
typedef uint8_t *wacky_int32_t;

#define WACKY_OFFSET 29
wacky_int32_t wacky_int32(int32_t number)
{
        uint8_t *buffer = (uint8_t *)calloc(sizeof(int32_t) + WACKY_OFFSET, sizeof(uint8_t));
        memcpy(buffer + WACKY_OFFSET, &number, sizeof(int32_t));
        return(buffer);
}

static inline int32_t unwacky_int32(wacky_int32_t number)
{
        return(*(int32_t *)(number + WACKY_OFFSET));
}

long perfcount()
{
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
        return(ts.tv_nsec);
}

int main(int argc, char **argv)
{
        int testcount = 0;
        while(testcount++ < 24) {
                int32_t randa = rand();
                int32_t randb = rand();
                int32_t randc = rand();
                int32_t randd = 1 + rand();

                wacky_int32_t a =  wacky_int32(randa);
                wacky_int32_t b =  wacky_int32(randb);
                wacky_int32_t c =  wacky_int32(randc);
                wacky_int32_t d =  wacky_int32(randd);

                int32_t *a2 = (int32_t *)calloc(WACKY_OFFSET+1, sizeof(int32_t));
                int32_t *b2 = (int32_t *)calloc(WACKY_OFFSET+1, sizeof(int32_t));
                int32_t *c2 = (int32_t *)calloc(WACKY_OFFSET+1, sizeof(int32_t));
                int32_t *d2 = (int32_t *)calloc(WACKY_OFFSET+1, sizeof(int32_t));
                a2[WACKY_OFFSET] = randa;
                b2[WACKY_OFFSET] = randb;
                c2[WACKY_OFFSET] = randc;
                d2[WACKY_OFFSET] = randd;

                long start = perfcount();
                int32_t ans = (unwacky_int32(a) + (unwacky_int32(b) * unwacky_int32(c))) % unwacky_int32(d);
                long wackytime = perfcount() - start;
                free(a); free(b); free(c); free(d);

                start = perfcount();
                int32_t ans2 = (a2[WACKY_OFFSET] + (b2[WACKY_OFFSET] * c2[WACKY_OFFSET])) % d2[WACKY_OFFSET];
                long normaltime = perfcount() - start;
                free(a2); free(b2); free(c2); free(d2);

                printf("[wacky mode ] ans = %-16d time = %-16ld\n[normal mode] ans = %-16d time = %-16ld\n\n", 
                        ans, wackytime, ans2, normaltime);
        }
        return 0;
}

Here's some output:

[wacky mode ] ans = -139296355       time = 370             
[normal mode] ans = -139296355       time = 124             

[wacky mode ] ans = 1254173191       time = 134             
[normal mode] ans = 1254173191       time = 127             

[wacky mode ] ans = -428008505       time = 95              
[normal mode] ans = -428008505       time = 91              

[wacky mode ] ans = 1411083651       time = 90              
[normal mode] ans = 1411083651       time = 91              

[wacky mode ] ans = -250251228       time = 88              
[normal mode] ans = -250251228       time = 69              

[wacky mode ] ans = 1670511475       time = 90              
[normal mode] ans = 1670511475       time = 76              

[wacky mode ] ans = -142905250       time = 93              
[normal mode] ans = -142905250       time = 75              

[wacky mode ] ans = 402377226        time = 107             
[normal mode] ans = 402377226        time = 76              

[wacky mode ] ans = -680962320       time = 93              
[normal mode] ans = -680962320       time = 73              

[wacky mode ] ans = -992960967       time = 98              
[normal mode] ans = -992960967       time = 72              

[wacky mode ] ans = 20339958         time = 95              
[normal mode] ans = 20339958         time = 72              

[wacky mode ] ans = -1090114074      time = 95              
[normal mode] ans = -1090114074      time = 78              

[wacky mode ] ans = 170467638        time = 95              
[normal mode] ans = 170467638        time = 76              

[wacky mode ] ans = 102978457        time = 88              
[normal mode] ans = 102978457        time = 73              

[wacky mode ] ans = 96879004         time = 95              
[normal mode] ans = 96879004         time = 78              

[wacky mode ] ans = 941108877        time = 94              
[normal mode] ans = 941108877        time = 76              

[wacky mode ] ans = -3164800         time = 92              
[normal mode] ans = -3164800         time = 72              

[wacky mode ] ans = -73124107        time = 88              
[normal mode] ans = -73124107        time = 73              

[wacky mode ] ans = 759564988        time = 94              
[normal mode] ans = 759564988        time = 76              

[wacky mode ] ans = 103176158        time = 92              
[normal mode] ans = 103176158        time = 78              

[wacky mode ] ans = 1234836399       time = 94              
[normal mode] ans = 1234836399       time = 79              

[wacky mode ] ans = 498712444        time = 89              
[normal mode] ans = 498712444        time = 74              

[wacky mode ] ans = 207578849        time = 97              
[normal mode] ans = 207578849        time = 76              

[wacky mode ] ans = 1447403380       time = 91              
[normal mode] ans = 1447403380       time = 70 
11
  • 5
    On some architectures this will signal a bus error. Commented Feb 3, 2022 at 0:35
  • "how much I should care about alignment" - the best answer for this comes from consulting your compiler's, and microprocessor's documentation before rolling your own benchmarks. Commented Feb 3, 2022 at 0:35
  • If it doesn't signal an error, it might have performance impact. Commented Feb 3, 2022 at 0:37
  • Understanding what alignment is is pretty important. Note that alignment isn't about the offset from the base of an array, but about the base address itself. Standard C allocators have no guarantees about alignment - usually you need to use system-specific aligned allocators. Performance slowdowns occur when a fetch crosses a memory page bound, typically 4096 bytes. e.g. If your four byte integer is split between two pages (its base address is within the last 3 bytes) then the CPU will have to read (and potentially fetch) two pages to read the one integer. This can cause cache thrashing. Commented Feb 3, 2022 at 0:38
  • 3
    @Qix-MONICAWASMISTREATED: “Standard C allocators have no guarantees about alignment”: The C standard disagrees with you. C 2018 7.22.3 1 says “… The pointer returned if the allocation succeeds is suitably aligned so that it may be assigned to a pointer to any type of object with a fundamental alignment requirement and then used to access such an object or an array of such objects in the space allocated (until the space is explicitly deallocated)…” Commented Feb 3, 2022 at 0:41

3 Answers 3

4

The consequence of reading unaligned integers depends very much on the compiler and hardware, i.e. it is implementation dependent and not directly covered by the C standard. It is, however, indirectly covered since the constructs needed to make unaligned access fall in the infamous "undefined behavior" category (which does not exclude compilers from defining a behavior). Thus, the following is a - probably incomplete - account of what may be observed on a given system.

On a 8 bit CPU it will typically not matter at all (the second bullet below may apply, though). On a 16+ bit CPU it will typically have some consequence to do a read which is not aligned to the byte width of the CPU. The possible consequences include:

  • Performance penalty since the operation involves more memory fetches than aligned read.
  • Performance penalty or faulty behavior if the operation breaks assumptions/requirements of the memory cache system.
  • Causing an otherwise atomic operation to become non-atomic (which can be a serious issue if the variable is shared between execution threads, including interrupts).
  • Triggering a CPU error (which will typically terminate the program or enter an error state).

It should be mentioned that unaligned access can happen due to subtle reasons. I have experienced that compiler optimization of initialization of a structure (only with char members) caused a hard fault because the struct was not aligned as the compiler expected.

About the performance measurements in the question: the time spans are very short and may be affected by system interrupts running in between the time readings. To get reliable results, an analysis of average and deviations over many iterations should be made.

That being said, the results seem to show a performance penalty of unaligned access within what could be expected.

Sign up to request clarification or add additional context in comments.

2 Comments

This is a very informative answer, thank you. That's an interesting problem with your structs containing chars. Simply initializing caused a hard fault? Shouldn't a compiler add padding to make sure this doesn't happen?? I've never tried to turn C code into optimized machine code so I'm sure there is a good reason.
@Logan Thank you. As I recall the struct problem, it was in some library code where the memory was originally allocated as a char array and then a pointer to the array was casted to a struct pointer and passed to a function. The problem was that the char array and thus the struct pointer was not 4-byte aligned as the compiler rightfully expected. The function initialized the struct by assigning constants, mostly 0, to the members. 4 of these assignments were optimized into a single 32-bit assignment which was not aligned as it should be. Hence the hard fault.
1

The issue with unaligned access is not that it might be slower.
The issue with unaligned access is not that it might be unpredictably slower.
No, the issue with unaligned access is not that it might not work at all.

Unaligned access is undefined behavior, and undefined behavior is, usually, poison.
And you absolutely can not derive a useful conclusion about undefined behavior by trying it and observing that it seems to work on your machine (today).

If you're trying to write a useful program, you want one that works everywhere, every day, on anyone's machine.
"Works on my machine" is not a useful certification.

6 Comments

It's UB in C/C++. That doesn't mean it can't be reasoned about. ISAs very strictly define what is and is not allowed when it comes to unaligned access. It depends on how (un)portable you want to write code. Portability and usefulness are mutually exclusive unless you're part of the GNU cult. Depends on the code and the intended user.
It's UB in C/C++ And the only tag is [c], so that's exactly what we're talking about here!
@Qix-MONICAWASMISTREATED Despite the common misconception, C is not "structured assembler". If something is "undefined behavior", the code produced by C compiler might do something completely unexpected due to optimizations applied by the compiler. At C language level, Undefined Behavior can not be reasoned about. You can reason about it by examining the assembly output of compiler, but that is valid only as long as you don't change operating system, compiler or compile options.
@Qix And, although it's difficult to discuss, there are different degrees of UB. Calling atoi("123x") is not as bad as unaligned access, which is not as bad as i++ + i++. But these are not generally things that can be reasoned about, and just about everyone has a different tolerance threshold. (Me, I have few qualms about calling atoi("123x"), although I'd never code an unaligned access, for any platform. And I'm sure we'd both eschew i++ + i++.)
Of course there are also examples of behaviors that are formally undefined by C, and that are formally defined by various other standards, such as Posix, or a particular compiler or OS. A poor example is fflush(stdin). There are better ones I'm forgetting.
|
0

Generally, you should care. Use memcpy both ways:

static inline int32_t unwacky_int32(wacky_int32_t number)
{
        int32_t r;
        memcpy(&r, number + WACKY_OFFSET, sizeof(int32_t));
        return r;
}

The compiler should be able to optimize that to produce optimal code, so there's no performace worry, so there is really no reason write non-portable (doesn't work on some CPUs, generates bus error due to alignment) code.

2 Comments

If that's all I have to do to pull misaligned integers out of a byte array without causing fatal or even performance problems, then this is a very convenient and easy fix.
@Logan Indeed. It's very likely that with any optimizations enabled, r here will be in register, and memcpy gets replaced by the most efficient code to pull 4 bytes from unaligned memory address to this register (depends on CPU and the CPU mode the application is running in).

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.